Skip to content

[Offload] Add an unloadBinary interface to PluginInterface #143873

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Jun 25, 2025
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion offload/liboffload/API/Program.td
Original file line number Diff line number Diff line change
@@ -13,7 +13,9 @@
def : Function {
let name = "olCreateProgram";
let desc = "Create a program for the device from the binary image pointed to by `ProgData`.";
let details = [];
let details = [
"The provided `ProgData` will be copied and need not outlive the returned handle",
];
let params = [
Param<"ol_device_handle_t", "Device", "handle of the device", PARAM_IN>,
Param<"const void*", "ProgData", "pointer to the program binary data", PARAM_IN>,
8 changes: 8 additions & 0 deletions offload/liboffload/src/OffloadImpl.cpp
Original file line number Diff line number Diff line change
@@ -458,6 +458,14 @@ Error olCreateProgram_impl(ol_device_handle_t Device, const void *ProgData,
}

Error olDestroyProgram_impl(ol_program_handle_t Program) {
auto &Device = Program->Image->getDevice();
if (auto Err = Device.unloadBinary(Program->Image))
return Err;

auto &LoadedImages = Device.LoadedImages;
LoadedImages.erase(
std::find(LoadedImages.begin(), LoadedImages.end(), Program->Image));

return olDestroy(Program);
}

20 changes: 7 additions & 13 deletions offload/plugins-nextgen/amdgpu/src/rtl.cpp
Original file line number Diff line number Diff line change
@@ -2023,6 +2023,13 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
return Plugin::success();
}

Error unloadBinaryImpl(DeviceImageTy *Image) override {
AMDGPUDeviceImageTy &AMDImage = static_cast<AMDGPUDeviceImageTy &>(*Image);

// Unload the executable of the image.
return AMDImage.unloadExecutable();
}

/// Deinitialize the device and release its resources.
Error deinitImpl() override {
// Deinitialize the stream and event pools.
@@ -2035,19 +2042,6 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
if (auto Err = AMDGPUSignalManager.deinit())
return Err;

// Close modules if necessary.
if (!LoadedImages.empty()) {
// Each image has its own module.
for (DeviceImageTy *Image : LoadedImages) {
AMDGPUDeviceImageTy &AMDImage =
static_cast<AMDGPUDeviceImageTy &>(*Image);

// Unload the executable of the image.
if (auto Err = AMDImage.unloadExecutable())
return Err;
}
}

// Invalidate agent reference.
Agent = {0};

12 changes: 8 additions & 4 deletions offload/plugins-nextgen/common/include/PluginInterface.h
Original file line number Diff line number Diff line change
@@ -735,6 +735,10 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
virtual Expected<DeviceImageTy *>
loadBinaryImpl(const __tgt_device_image *TgtImage, int32_t ImageId) = 0;

/// Unload a previously loaded Image from the device
Error unloadBinary(DeviceImageTy *Image);
virtual Error unloadBinaryImpl(DeviceImageTy *Image) = 0;

/// Setup the device environment if needed. Notice this setup may not be run
/// on some plugins. By default, it will be executed, but plugins can change
/// this behavior by overriding the shouldSetupDeviceEnvironment function.
@@ -1019,6 +1023,10 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
BoolEnvar OMPX_TrackAllocationTraces =
BoolEnvar("OFFLOAD_TRACK_ALLOCATION_TRACES", false);

/// Array of images loaded into the device. Images are automatically
/// deallocated by the allocator.
llvm::SmallVector<DeviceImageTy *> LoadedImages;

private:
/// Get and set the stack size and heap size for the device. If not used, the
/// plugin can implement the setters as no-op and setting the output
@@ -1069,10 +1077,6 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
UInt32Envar OMPX_InitialNumStreams;
UInt32Envar OMPX_InitialNumEvents;

/// Array of images loaded into the device. Images are automatically
/// deallocated by the allocator.
llvm::SmallVector<DeviceImageTy *> LoadedImages;

/// The identifier of the device within the plugin. Notice this is not a
/// global device id and is not the device id visible to the OpenMP user.
const int32_t DeviceId;
72 changes: 38 additions & 34 deletions offload/plugins-nextgen/common/src/PluginInterface.cpp
Original file line number Diff line number Diff line change
@@ -821,26 +821,49 @@ Error GenericDeviceTy::init(GenericPluginTy &Plugin) {
return Plugin::success();
}

Error GenericDeviceTy::deinit(GenericPluginTy &Plugin) {
for (DeviceImageTy *Image : LoadedImages)
if (auto Err = callGlobalDestructors(Plugin, *Image))
return Err;
Error GenericDeviceTy::unloadBinary(DeviceImageTy *Image) {
if (auto Err = callGlobalDestructors(Plugin, *Image))
return Err;

if (OMPX_DebugKind.get() & uint32_t(DeviceDebugKind::AllocationTracker)) {
GenericGlobalHandlerTy &GHandler = Plugin.getGlobalHandler();
for (auto *Image : LoadedImages) {
DeviceMemoryPoolTrackingTy ImageDeviceMemoryPoolTracking = {0, 0, ~0U, 0};
GlobalTy TrackerGlobal("__omp_rtl_device_memory_pool_tracker",
sizeof(DeviceMemoryPoolTrackingTy),
&ImageDeviceMemoryPoolTracking);
if (auto Err =
GHandler.readGlobalFromDevice(*this, *Image, TrackerGlobal)) {
consumeError(std::move(Err));
continue;
}
DeviceMemoryPoolTracking.combine(ImageDeviceMemoryPoolTracking);
DeviceMemoryPoolTrackingTy ImageDeviceMemoryPoolTracking = {0, 0, ~0U, 0};
GlobalTy TrackerGlobal("__omp_rtl_device_memory_pool_tracker",
sizeof(DeviceMemoryPoolTrackingTy),
&ImageDeviceMemoryPoolTracking);
if (auto Err =
GHandler.readGlobalFromDevice(*this, *Image, TrackerGlobal)) {
consumeError(std::move(Err));
}
DeviceMemoryPoolTracking.combine(ImageDeviceMemoryPoolTracking);
}

GenericGlobalHandlerTy &Handler = Plugin.getGlobalHandler();
auto ProfOrErr = Handler.readProfilingGlobals(*this, *Image);
if (!ProfOrErr)
return ProfOrErr.takeError();

if (!ProfOrErr->empty()) {
// Dump out profdata
if ((OMPX_DebugKind.get() & uint32_t(DeviceDebugKind::PGODump)) ==
uint32_t(DeviceDebugKind::PGODump))
ProfOrErr->dump();

// Write data to profiling file
if (auto Err = ProfOrErr->write())
return Err;
}

return unloadBinaryImpl(Image);
}

Error GenericDeviceTy::deinit(GenericPluginTy &Plugin) {
for (auto &I : LoadedImages)
if (auto Err = unloadBinary(I))
return Err;
LoadedImages.clear();

if (OMPX_DebugKind.get() & uint32_t(DeviceDebugKind::AllocationTracker)) {
// TODO: Write this by default into a file.
printf("\n\n|-----------------------\n"
"| Device memory tracker:\n"
@@ -856,25 +879,6 @@ Error GenericDeviceTy::deinit(GenericPluginTy &Plugin) {
DeviceMemoryPoolTracking.AllocationMax);
}

for (auto *Image : LoadedImages) {
GenericGlobalHandlerTy &Handler = Plugin.getGlobalHandler();
auto ProfOrErr = Handler.readProfilingGlobals(*this, *Image);
if (!ProfOrErr)
return ProfOrErr.takeError();

if (ProfOrErr->empty())
continue;

// Dump out profdata
if ((OMPX_DebugKind.get() & uint32_t(DeviceDebugKind::PGODump)) ==
uint32_t(DeviceDebugKind::PGODump))
ProfOrErr->dump();

// Write data to profiling file
if (auto Err = ProfOrErr->write())
return Err;
}

// Delete the memory manager before deinitializing the device. Otherwise,
// we may delete device allocations after the device is deinitialized.
if (MemoryManager)
27 changes: 13 additions & 14 deletions offload/plugins-nextgen/cuda/src/rtl.cpp
Original file line number Diff line number Diff line change
@@ -358,6 +358,19 @@ struct CUDADeviceTy : public GenericDeviceTy {
return Plugin::success();
}

Error unloadBinaryImpl(DeviceImageTy *Image) override {
assert(Context && "Invalid CUDA context");

// Each image has its own module.
CUDADeviceImageTy &CUDAImage = static_cast<CUDADeviceImageTy &>(*Image);

// Unload the module of the image.
if (auto Err = CUDAImage.unloadModule())
return Err;

return Plugin::success();
}

/// Deinitialize the device and release its resources.
Error deinitImpl() override {
if (Context) {
@@ -372,20 +385,6 @@ struct CUDADeviceTy : public GenericDeviceTy {
if (auto Err = CUDAEventManager.deinit())
return Err;

// Close modules if necessary.
if (!LoadedImages.empty()) {
assert(Context && "Invalid CUDA context");

// Each image has its own module.
for (DeviceImageTy *Image : LoadedImages) {
CUDADeviceImageTy &CUDAImage = static_cast<CUDADeviceImageTy &>(*Image);

// Unload the module of the image.
if (auto Err = CUDAImage.unloadModule())
return Err;
}
}

if (Context) {
CUresult Res = cuDevicePrimaryCtxRelease(Device);
if (auto Err =