Skip to content

Commit f1124c5

Browse files
committed
memory resident limit to enabled peers
1 parent 5c5e364 commit f1124c5

35 files changed

+685
-284
lines changed

sycl/cmake/modules/BuildUnifiedRuntime.cmake

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,9 @@ set(UR_BUILD_EXAMPLES "${SYCL_UR_BUILD_TESTS}" CACHE BOOL "" FORCE)
1616
option(SYCL_UR_FORMAT_CPP_STYLE "Format code style of UR C++ sources" OFF)
1717
set(UR_FORMAT_CPP_STYLE "${SYCL_UR_FORMAT_CPP_STYLE}" CACHE BOOL "" FORCE)
1818

19+
option(SYCL_UR_ENABLE_ASSERTIONS "Enable assertions for all UR build types" OFF)
20+
set(UR_ENABLE_ASSERTIONS "${SYCL_UR_ENABLE_ASSERTIONS}" CACHE BOOL "" FORCE)
21+
1922
# Here we override the defaults to unified-runtime
2023
set(UR_BUILD_XPTI_LIBS OFF CACHE BOOL "")
2124
set(UR_ENABLE_SYMBOLIZER ON CACHE BOOL "Enable symbolizer for sanitizer layer.")

sycl/source/device.cpp

Lines changed: 14 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -219,19 +219,27 @@ bool device::has(aspect Aspect) const { return impl->has(Aspect); }
219219
void device::ext_oneapi_enable_peer_access(const device &peer) {
220220
ur_device_handle_t Device = impl->getHandleRef();
221221
ur_device_handle_t Peer = peer.impl->getHandleRef();
222-
if (Device != Peer) {
223-
detail::adapter_impl &Adapter = impl->getAdapter();
224-
Adapter.call<detail::UrApiKind::urUsmP2PEnablePeerAccessExp>(Device, Peer);
222+
223+
if (Device == Peer) return;
224+
225+
if (peer.get_platform() != get_platform()) {
226+
throw exception(errc::invalid, "Can not enable peer access between different platforms");
225227
}
228+
229+
impl->getAdapter().call<detail::UrApiKind::urUsmP2PEnablePeerAccessExp>(Device, Peer);
226230
}
227231

228232
void device::ext_oneapi_disable_peer_access(const device &peer) {
229233
ur_device_handle_t Device = impl->getHandleRef();
230234
ur_device_handle_t Peer = peer.impl->getHandleRef();
231-
if (Device != Peer) {
232-
detail::adapter_impl &Adapter = impl->getAdapter();
233-
Adapter.call<detail::UrApiKind::urUsmP2PDisablePeerAccessExp>(Device, Peer);
235+
236+
if (Device == Peer) return;
237+
238+
if (peer.get_platform() != get_platform()) {
239+
throw exception(errc::invalid, "Can not disable peer access between different platforms");
234240
}
241+
242+
impl->getAdapter().call<detail::UrApiKind::urUsmP2PDisablePeerAccessExp>(Device, Peer);
235243
}
236244

237245
bool device::ext_oneapi_can_access_peer(const device &peer,

unified-runtime/cmake/Assertions.cmake

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ if(UR_ENABLE_ASSERTIONS)
88
# MSVC doesn't like _DEBUG on release builds
99
if( NOT MSVC )
1010
add_compile_definitions(_DEBUG)
11+
add_compile_definitions(UR_DASSERT_ENABLED)
1112
endif()
1213
# On non-Debug builds cmake automatically defines NDEBUG, so we
1314
# explicitly undefine it:

unified-runtime/source/adapters/level_zero/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -150,7 +150,6 @@ if(UR_BUILD_ADAPTER_L0_V2)
150150
${CMAKE_CURRENT_SOURCE_DIR}/helpers/kernel_helpers.cpp
151151
${CMAKE_CURRENT_SOURCE_DIR}/helpers/memory_helpers.cpp
152152
${CMAKE_CURRENT_SOURCE_DIR}/helpers/mutable_helpers.cpp
153-
${CMAKE_CURRENT_SOURCE_DIR}/usm_p2p.cpp
154153
${CMAKE_CURRENT_SOURCE_DIR}/virtual_mem.cpp
155154
${CMAKE_CURRENT_SOURCE_DIR}/../../ur/ur.cpp
156155
${CMAKE_CURRENT_SOURCE_DIR}/sampler.hpp
@@ -191,6 +190,7 @@ if(UR_BUILD_ADAPTER_L0_V2)
191190
${CMAKE_CURRENT_SOURCE_DIR}/v2/queue_immediate_in_order.cpp
192191
${CMAKE_CURRENT_SOURCE_DIR}/v2/queue_immediate_out_of_order.cpp
193192
${CMAKE_CURRENT_SOURCE_DIR}/v2/usm.cpp
193+
${CMAKE_CURRENT_SOURCE_DIR}/v2/usm_p2p.cpp
194194
)
195195
install_ur_library(ur_adapter_level_zero_v2)
196196

unified-runtime/source/adapters/level_zero/adapter.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -569,6 +569,7 @@ ur_adapter_handle_t_::ur_adapter_handle_t_()
569569
if (err == UR_RESULT_SUCCESS) {
570570
Platforms = std::move(platforms);
571571
} else {
572+
UR_LOG(ERR, "Failed to initialize Platforms");
572573
throw err;
573574
}
574575
}

unified-runtime/source/adapters/level_zero/common.hpp

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -218,8 +218,11 @@ void zeParseError(ze_result_t ZeError, const char *&ErrorString);
218218
#define ZE2UR_CALL_THROWS(ZeName, ZeArgs) \
219219
{ \
220220
ze_result_t ZeResult = ZeName ZeArgs; \
221-
if (auto Result = ZeCall().doCall(ZeResult, #ZeName, #ZeArgs, true)) \
221+
if (auto Result = ZeCall().doCall(ZeResult, #ZeName, #ZeArgs, true)) { \
222+
UR_DFAILURE("failed ZE call " #ZeName " with " #ZeArgs ", with result:" \
223+
<< Result); \
222224
throw ze2urResult(Result); \
225+
} \
223226
}
224227

225228
// Perform traced call to L0 without checking for errors

unified-runtime/source/adapters/level_zero/context.cpp

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -42,16 +42,18 @@ ur_result_t urContextCreate(
4242

4343
Context->initialize();
4444
*RetContext = reinterpret_cast<ur_context_handle_t>(Context);
45+
// TODO: delete below 'if' when memory isolation in the context is
46+
// implemented in the driver
4547
if (IndirectAccessTrackingEnabled) {
4648
std::scoped_lock<ur_shared_mutex> Lock(Platform->ContextsMutex);
4749
Platform->Contexts.push_back(*RetContext);
4850
}
4951
} catch (const std::bad_alloc &) {
5052
return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY;
51-
} catch (umf_result_t e) {
52-
return umf::umf2urResult(e);
53-
} catch (...) {
54-
return UR_RESULT_ERROR_UNKNOWN;
53+
} catch (umf_result_t e) {
54+
return umf::umf2urResult(e);
55+
} catch (...) {
56+
return UR_RESULT_ERROR_UNKNOWN;
5557
}
5658

5759
return UR_RESULT_SUCCESS;

unified-runtime/source/adapters/level_zero/device.cpp

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2190,3 +2190,24 @@ void ZeUSMImportExtension::doZeUSMRelease(ze_driver_handle_t DriverHandle,
21902190
void *HostPtr) {
21912191
ZE_CALL_NOCHECK(zexDriverReleaseImportedPointer, (DriverHandle, HostPtr));
21922192
}
2193+
2194+
// Streams a device handle as the value of its optional Id, or as the literal
// "NONE" when the Id holds no value. Returns the stream to allow chaining.
std::ostream &operator<<(std::ostream &os,
2195+
ur_device_handle_t_ const &device_handle) {
2196+
if (device_handle.Id.has_value()) {
2197+
return os << device_handle.Id.value();
2198+
}
2199+
// No id is set on this handle: print a placeholder instead.
return os << "NONE";
2200+
}
2201+
2202+
// Streams the symbolic name of a PeerStatus value ("DISABLED", "ENABLED" or
// "NO_CONNECTION"). Returns the stream to allow chaining.
std::ostream &operator<<(std::ostream &os,
2203+
ur_device_handle_t_::PeerStatus peer_status) {
2204+
switch (peer_status) {
2205+
case ur_device_handle_t_::PeerStatus::DISABLED:
2206+
return os << "DISABLED";
2207+
case ur_device_handle_t_::PeerStatus::ENABLED:
2208+
return os << "ENABLED";
2209+
case ur_device_handle_t_::PeerStatus::NO_CONNECTION:
2210+
return os << "NO_CONNECTION";
2211+
}
2212+
// Reached only when peer_status matches none of the cases above.
return os << "UNKNOWN";
2213+
}

unified-runtime/source/adapters/level_zero/device.hpp

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -254,17 +254,29 @@ struct ur_device_handle_t_ : ur_object {
254254
std::unordered_map<ur_exp_image_native_handle_t, ze_image_handle_t>
255255
ZeOffsetToImageHandleMap;
256256

257+
// Devices which user enabled p2p access by
258+
// urUsmP2P(Enable|Disable)PeerAccessExp. Devices are indexed by device id.
259+
enum class PeerStatus : char { ENABLED, DISABLED, NO_CONNECTION };
260+
std::vector<PeerStatus>
261+
peers; // info if our device can access given peer device allocations
262+
257263
// unique ephemeral identifier of the device in the adapter
258264
std::optional<DeviceId> Id;
259265

260266
ur::RefCount RefCount;
261267
};
262268

269+
std::ostream &operator<<(std::ostream &os,
270+
ur_device_handle_t_ const &device_handle);
271+
std::ostream &operator<<(std::ostream &os,
272+
ur_device_handle_t_::PeerStatus peer_status);
273+
263274
// Collects a flat vector of unique devices for USM memory pool creation.
264275
// Traverses the input devices and their sub-devices, ensuring each Level Zero
265276
// device handle appears only once in the result.
266277
inline std::vector<ur_device_handle_t> CollectDevicesForUsmPoolCreation(
267278
const std::vector<ur_device_handle_t> &Devices) {
279+
268280
std::vector<ur_device_handle_t> DevicesAndSubDevices;
269281
std::unordered_set<ze_device_handle_t> Seen;
270282

unified-runtime/source/adapters/level_zero/platform.cpp

Lines changed: 41 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -630,9 +630,9 @@ ur_platform_handle_t_::getDeviceFromNativeHandle(ze_device_handle_t ZeDevice) {
630630
std::shared_lock<ur_shared_mutex> Lock(URDevicesCacheMutex);
631631
auto it = std::find_if(URDevicesCache.begin(), URDevicesCache.end(),
632632
[&](std::unique_ptr<ur_device_handle_t_> &D) {
633-
return D.get()->ZeDevice == ZeDevice &&
634-
(D.get()->RootDevice == nullptr ||
635-
D.get()->RootDevice->RootDevice == nullptr);
633+
return D->ZeDevice == ZeDevice &&
634+
(D->RootDevice == nullptr ||
635+
D->RootDevice->RootDevice == nullptr);
636636
});
637637
if (it != URDevicesCache.end()) {
638638
return (*it).get();
@@ -785,6 +785,44 @@ ur_result_t ur_platform_handle_t_::populateDeviceCacheIfNeeded() {
785785
dev->Id = id++;
786786
}
787787

788+
for (auto &dev : URDevicesCache) {
789+
dev->peers = std::vector<ur_device_handle_t_::PeerStatus>(
790+
URDevicesCache.size(), ur_device_handle_t_::PeerStatus::NO_CONNECTION);
791+
792+
for (size_t peerId = 0; peerId < URDevicesCache.size(); ++peerId) {
793+
if (peerId == dev->Id.value())
794+
continue;
795+
796+
ZeStruct<ze_device_p2p_properties_t> p2pProperties;
797+
ZE2UR_CALL_THROWS(
798+
zeDeviceGetP2PProperties,
799+
(dev->ZeDevice, URDevicesCache[peerId]->ZeDevice, &p2pProperties));
800+
if (!(p2pProperties.flags & ZE_DEVICE_P2P_PROPERTY_FLAG_ACCESS)) {
801+
UR_LOG(INFO,
802+
"p2p access to memory of dev:{} from dev:{} not possible due to "
803+
"lack of p2p property",
804+
peerId, dev->Id.value());
805+
continue;
806+
}
807+
808+
ze_bool_t p2p;
809+
ZE2UR_CALL_THROWS(
810+
zeDeviceCanAccessPeer,
811+
(dev->ZeDevice, URDevicesCache[peerId]->ZeDevice, &p2p));
812+
if (!p2p) {
813+
UR_LOG(INFO,
814+
"p2p access to memory of dev:{} from dev:{} not possible due to "
815+
"no connection",
816+
peerId, dev->Id.value());
817+
continue;
818+
}
819+
820+
UR_LOG(INFO, "p2p access to memory of dev:{} from dev:{} can be enabled",
821+
peerId, dev->Id.value());
822+
dev->peers[peerId] = ur_device_handle_t_::PeerStatus::DISABLED;
823+
}
824+
}
825+
788826
return UR_RESULT_SUCCESS;
789827
}
790828

0 commit comments

Comments
 (0)