This repository was archived by the owner on Dec 24, 2024. It is now read-only.

Commit 68669f4

rocr: Generalize AMD::MemoryRegion Allocate and Free
Remove KFD-specific Allocate/Free calls from AMD::MemoryRegion. The KFD-driver-specific Allocate/Free calls are now implemented in the KfdDriver. Future changes will migrate the remaining KFD-specific calls out of AMD::MemoryRegion. This allows MemoryRegion to be used across AMD drivers such as the XDNA driver.

Change-Id: Ib6a2a9e5e1a15e61644d2592beb3a8e6578c3010
1 parent: c42ff44

11 files changed: +448 −326 lines
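
At a glance, the refactor inverts the dependency: AMD::MemoryRegion no longer issues hsaKmt calls itself but hands allocation to the driver bound to its owning agent. Below is a hypothetical, self-contained sketch of what the delegating call site might look like after this change; Driver, Agent, and the owner()/driver() accessors are stand-ins invented for illustration, and only AllocateMemory's parameter list mirrors the new interface shown in the diffs.

#include <cstddef>
#include <cstdint>

// Stand-in types; the real ones live in the ROCr headers.
using hsa_status_t = int;

struct MemoryRegion;

struct Driver {
  hsa_status_t AllocateMemory(const MemoryRegion &region, std::uint32_t flags,
                              void **mem, std::size_t size,
                              std::uint32_t node_id);
};

struct Agent {
  Driver &driver(); // hypothetical accessor, not from this commit
};

struct MemoryRegion {
  Agent *owner() const;

  hsa_status_t Allocate(std::size_t size, std::uint32_t alloc_flags,
                        void **address, std::uint32_t agent_node_id) const {
    // All KFD/XDNA-specific work now happens behind the driver interface.
    return owner()->driver().AllocateMemory(*this, alloc_flags, address, size,
                                            agent_node_id);
  }
};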

runtime/hsa-runtime/core/driver/kfd/amd_kfd_driver.cpp

+188 −7
@@ -49,6 +49,10 @@

 #include "hsakmt/hsakmt.h"

+#include "core/inc/amd_cpu_agent.h"
+#include "core/inc/amd_gpu_agent.h"
+#include "core/inc/amd_memory_region.h"
+#include "core/inc/exceptions.h"
 #include "core/inc/runtime.h"

 namespace rocr {
@@ -70,18 +74,155 @@ hsa_status_t KfdDriver::QueryKernelModeDriver(core::DriverQuery query) {
   return HSA_STATUS_SUCCESS;
 }

-hsa_status_t KfdDriver::GetMemoryProperties(uint32_t node_id,
-                                            core::MemProperties &mprops) const {
+hsa_status_t
+KfdDriver::GetMemoryProperties(uint32_t node_id,
+                               core::MemoryRegion &mem_region) const {
   return HSA_STATUS_SUCCESS;
 }

-hsa_status_t KfdDriver::AllocateMemory(void **mem, size_t size,
-                                       uint32_t node_id, core::MemFlags flags) {
-  return HSA_STATUS_SUCCESS;
+hsa_status_t
+KfdDriver::AllocateMemory(const core::MemoryRegion &mem_region,
+                          core::MemoryRegion::AllocateFlags alloc_flags,
+                          void **mem, size_t size, uint32_t agent_node_id) {
+  const MemoryRegion &m_region(static_cast<const MemoryRegion &>(mem_region));
+  HsaMemFlags kmt_alloc_flags(m_region.mem_flags());
+
+  kmt_alloc_flags.ui32.ExecuteAccess =
+      (alloc_flags & core::MemoryRegion::AllocateExecutable ? 1 : 0);
+  kmt_alloc_flags.ui32.AQLQueueMemory =
+      (alloc_flags & core::MemoryRegion::AllocateDoubleMap ? 1 : 0);
+
+  if (m_region.IsSystem() &&
+      (alloc_flags & core::MemoryRegion::AllocateNonPaged)) {
+    kmt_alloc_flags.ui32.NonPaged = 1;
+  }
+
+  // Allocating a memory handle for virtual memory.
+  kmt_alloc_flags.ui32.NoAddress =
+      !!(alloc_flags & core::MemoryRegion::AllocateMemoryOnly);
+
+  // Allocate pseudo fine-grain memory.
+  kmt_alloc_flags.ui32.CoarseGrain =
+      (alloc_flags & core::MemoryRegion::AllocatePCIeRW
+           ? 0
+           : kmt_alloc_flags.ui32.CoarseGrain);
+
+  kmt_alloc_flags.ui32.NoSubstitute =
+      (alloc_flags & core::MemoryRegion::AllocatePinned
+           ? 1
+           : kmt_alloc_flags.ui32.NoSubstitute);
+
+  kmt_alloc_flags.ui32.GTTAccess =
+      (alloc_flags & core::MemoryRegion::AllocateGTTAccess
+           ? 1
+           : kmt_alloc_flags.ui32.GTTAccess);
+
+  if (m_region.IsLocalMemory()) {
+    // Allocate physically contiguous memory. The AllocateKfdMemory call
+    // will fail if this flag is not supported in KFD.
+    kmt_alloc_flags.ui32.Contiguous =
+        (alloc_flags & core::MemoryRegion::AllocateContiguous
+             ? 1
+             : kmt_alloc_flags.ui32.Contiguous);
+  }
+
+  // Only allow using the suballocator for ordinary VRAM.
+  if (m_region.IsLocalMemory() && !kmt_alloc_flags.ui32.NoAddress) {
+    bool subAllocEnabled =
+        !core::Runtime::runtime_singleton_->flag().disable_fragment_alloc();
+    // Avoid modifying executable or queue allocations.
+    bool useSubAlloc = subAllocEnabled;
+    useSubAlloc &=
+        ((alloc_flags & (~core::MemoryRegion::AllocateRestrict)) == 0);
+
+    if (useSubAlloc) {
+      *mem = m_region.fragment_alloc(size);
+
+      if ((alloc_flags & core::MemoryRegion::AllocateAsan) &&
+          hsaKmtReplaceAsanHeaderPage(*mem) != HSAKMT_STATUS_SUCCESS) {
+        m_region.fragment_free(*mem);
+        *mem = nullptr;
+        return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
+      }
+
+      return HSA_STATUS_SUCCESS;
+    }
+  }
+
+  const uint32_t node_id =
+      (alloc_flags & core::MemoryRegion::AllocateGTTAccess)
+          ? agent_node_id
+          : m_region.owner()->node_id();
+
+  // Allocate memory.
+  // If it fails, attempt to release memory from the block allocator and retry.
+  *mem = AllocateKfdMemory(kmt_alloc_flags, node_id, size);
+  if (*mem == nullptr) {
+    m_region.owner()->Trim();
+    *mem = AllocateKfdMemory(kmt_alloc_flags, node_id, size);
+  }
+
+  if (*mem != nullptr) {
+    if (kmt_alloc_flags.ui32.NoAddress)
+      return HSA_STATUS_SUCCESS;
+
+    // Commit the memory.
+    // For system memory, on non-restricted allocation, map it to all GPUs. On
+    // restricted allocation, only the CPU is allowed access by default, so
+    // there is no need to map.
+    // For local memory, only map it to the owning GPU. Mapping to other GPUs,
+    // if access is allowed, is performed on AllowAccess.
+    HsaMemMapFlags map_flag = m_region.map_flags();
+    size_t map_node_count = 1;
+    const uint32_t owner_node_id = m_region.owner()->node_id();
+    const uint32_t *map_node_id = &owner_node_id;
+
+    if (m_region.IsSystem()) {
+      if ((alloc_flags & core::MemoryRegion::AllocateRestrict) == 0) {
+        // Map to all GPU agents.
+        map_node_count = core::Runtime::runtime_singleton_->gpu_ids().size();
+
+        if (map_node_count == 0) {
+          // No need to pin since there is no GPU in the platform.
+          return HSA_STATUS_SUCCESS;
+        }
+
+        map_node_id = &core::Runtime::runtime_singleton_->gpu_ids()[0];
+      } else {
+        // No need to pin it for CPU-exclusive access.
+        return HSA_STATUS_SUCCESS;
+      }
+    }
+
+    uint64_t alternate_va = 0;
+    const bool is_resident = MakeKfdMemoryResident(
+        map_node_count, map_node_id, *mem, size, &alternate_va, map_flag);
+
+    const bool require_pinning =
+        (!m_region.full_profile() || m_region.IsLocalMemory() ||
+         m_region.IsScratch());
+
+    if (require_pinning && !is_resident) {
+      FreeKfdMemory(*mem, size);
+      *mem = nullptr;
+      return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
+    }
+
+    if ((alloc_flags & core::MemoryRegion::AllocateAsan) &&
+        hsaKmtReplaceAsanHeaderPage(*mem) != HSAKMT_STATUS_SUCCESS) {
+      FreeKfdMemory(*mem, size);
+      *mem = nullptr;
+      return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
+    }
+    return HSA_STATUS_SUCCESS;
+  }
+
+  return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
 }

-hsa_status_t KfdDriver::FreeMemory(void *mem, uint32_t node_id) {
-  return HSA_STATUS_SUCCESS;
+hsa_status_t KfdDriver::FreeMemory(void *mem, size_t size) {
+  MakeKfdMemoryUnresident(mem);
+  return FreeKfdMemory(mem, size) ? HSA_STATUS_SUCCESS : HSA_STATUS_ERROR;
 }

 hsa_status_t KfdDriver::CreateQueue(core::Queue &queue) {
@@ -92,5 +233,45 @@ hsa_status_t KfdDriver::DestroyQueue(core::Queue &queue) const {
   return HSA_STATUS_SUCCESS;
 }

+void *KfdDriver::AllocateKfdMemory(const HsaMemFlags &flags, uint32_t node_id,
+                                   size_t size) {
+  void *mem = nullptr;
+  const HSAKMT_STATUS status = hsaKmtAllocMemory(node_id, size, flags, &mem);
+  return (status == HSAKMT_STATUS_SUCCESS) ? mem : nullptr;
+}
+
+bool KfdDriver::FreeKfdMemory(void *mem, size_t size) {
+  if (mem == nullptr || size == 0) {
+    debug_print("Invalid free ptr:%p size:%lu\n", mem, size);
+    return true;
+  }
+
+  if (hsaKmtFreeMemory(mem, size) != HSAKMT_STATUS_SUCCESS) {
+    debug_print("Failed to free ptr:%p size:%lu\n", mem, size);
+    return false;
+  }
+  return true;
+}
+
+bool KfdDriver::MakeKfdMemoryResident(size_t num_node, const uint32_t *nodes,
+                                      const void *mem, size_t size,
+                                      uint64_t *alternate_va,
+                                      HsaMemMapFlags map_flag) {
+  assert(num_node > 0);
+  assert(nodes);
+
+  *alternate_va = 0;
+
+  HSAKMT_STATUS kmt_status(hsaKmtMapMemoryToGPUNodes(
+      const_cast<void *>(mem), size, alternate_va, map_flag, num_node,
+      const_cast<uint32_t *>(nodes)));
+
+  return (kmt_status == HSAKMT_STATUS_SUCCESS);
+}
+
+void KfdDriver::MakeKfdMemoryUnresident(const void *mem) {
+  hsaKmtUnmapMemoryToGPU(const_cast<void *>(mem));
+}
+
 } // namespace AMD
 } // namespace rocr
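
Two details of the KFD path above are worth noting. FreeMemory now takes the allocation size instead of a node id, because the underlying hsaKmtFreeMemory call (see FreeKfdMemory) needs the size to release the allocation. And AllocateMemory uses an allocate/trim/retry idiom: if the first KFD allocation fails, the owning agent's cached fragments are trimmed and the allocation is retried once. A self-contained distillation of that idiom follows; try_alloc and trim_caches are illustrative stand-ins (backed by malloc and a no-op here), not ROCr APIs.

#include <cstdlib>

// Stand-ins for AllocateKfdMemory and m_region.owner()->Trim().
static void *try_alloc(std::size_t size) { return std::malloc(size); }
static void trim_caches() { /* release cached suballocator fragments here */ }

// Distilled allocate/trim/retry idiom from KfdDriver::AllocateMemory.
void *alloc_with_trim_retry(std::size_t size) {
  void *mem = try_alloc(size); // first attempt
  if (mem == nullptr) {
    trim_caches();             // give back cached fragments
    mem = try_alloc(size);     // single retry
  }
  return mem;                  // may still be nullptr; caller must check
}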

runtime/hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp

+7 −5
@@ -47,6 +47,7 @@
 #include <memory>
 #include <string>

+#include "core/inc/amd_memory_region.h"
 #include "core/inc/runtime.h"
 #include "uapi/amdxdna_accel.h"

@@ -89,17 +90,18 @@ hsa_status_t XdnaDriver::QueryKernelModeDriver(core::DriverQuery query) {

 hsa_status_t
 XdnaDriver::GetMemoryProperties(uint32_t node_id,
-                                core::MemProperties &mprops) const {
+                                core::MemoryRegion &mem_region) const {
   return HSA_STATUS_SUCCESS;
 }

-hsa_status_t XdnaDriver::AllocateMemory(void **mem, size_t size,
-                                        uint32_t node_id,
-                                        core::MemFlags flags) {
+hsa_status_t
+XdnaDriver::AllocateMemory(const core::MemoryRegion &mem_region,
+                           core::MemoryRegion::AllocateFlags alloc_flags,
+                           void **mem, size_t size, uint32_t node_id) {
   return HSA_STATUS_SUCCESS;
 }

-hsa_status_t XdnaDriver::FreeMemory(void *mem, uint32_t node_id) {
+hsa_status_t XdnaDriver::FreeMemory(void *mem, size_t size) {
   return HSA_STATUS_SUCCESS;
 }

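
Both drivers now override the same memory-management entry points, which is what lets AMD::MemoryRegion stay driver-agnostic while XDNA fills in its stubs later. A hedged reconstruction of the common interface implied by the two overrides above is sketched below; hsa_status_t, MemoryRegion, and the flag values are local stand-ins, since the real definitions live in the HSA and ROCr headers.

#include <cstddef>
#include <cstdint>

using hsa_status_t = int; // stand-in for the HSA status enum

struct MemoryRegion {
  enum AllocateFlags : std::uint32_t {
    AllocateNoFlags = 0,
    AllocateRestrict = 1u << 0,
    AllocateExecutable = 1u << 1,
    // ...remaining flags as declared in core/inc/memory_region.h
  };
};

// Every AMD driver (KFD for GPUs, XDNA for AIE) implements this trio, so
// region code can allocate and free without knowing which ioctls are used.
class Driver {
public:
  virtual ~Driver() = default;
  virtual hsa_status_t GetMemoryProperties(std::uint32_t node_id,
                                           MemoryRegion &mem_region) const = 0;
  virtual hsa_status_t AllocateMemory(const MemoryRegion &mem_region,
                                      MemoryRegion::AllocateFlags alloc_flags,
                                      void **mem, std::size_t size,
                                      std::uint32_t node_id) = 0;
  virtual hsa_status_t FreeMemory(void *mem, std::size_t size) = 0;
};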

runtime/hsa-runtime/core/inc/agent.h

+12 −10
@@ -49,11 +49,12 @@
 #include <vector>

 #include "core/inc/checked.h"
+#include "core/inc/driver.h"
 #include "core/inc/isa.h"
-#include "core/inc/queue.h"
 #include "core/inc/memory_region.h"
-#include "core/util/utils.h"
+#include "core/inc/queue.h"
 #include "core/util/locks.h"
+#include "core/util/utils.h"

 namespace rocr {

@@ -117,19 +118,18 @@ class Agent : public Checked<0xF6BC25EB17E6F917> {
   // @brief Agent class contructor.
   //
   // @param [in] type CPU or GPU or other.
-  explicit Agent(uint32_t node_id, DeviceType type)
-      : node_id_(node_id),
-        device_type_(uint32_t(type)),
-        profiling_enabled_(false),
-        enabled_(false) {
+  explicit Agent(DriverType drv_type, uint32_t node_id, DeviceType type)
+      : driver_type(drv_type), node_id_(node_id), device_type_(uint32_t(type)),
+        profiling_enabled_(false), enabled_(false) {
     public_handle_ = Convert(this);
   }

   // @brief Agent class contructor.
   //
   // @param [in] type CPU or GPU or other.
-  explicit Agent(uint32_t node_id, uint32_t type)
-      : node_id_(node_id), device_type_(type), profiling_enabled_(false) {
+  explicit Agent(DriverType drv_type, uint32_t node_id, uint32_t type)
+      : driver_type(drv_type), node_id_(node_id), device_type_(type),
+        profiling_enabled_(false) {
     public_handle_ = Convert(this);
   }

@@ -315,7 +315,9 @@ class Agent : public Checked<0xF6BC25EB17E6F917> {
     for (auto region : regions()) region->Trim();
   }

- protected:
+  const DriverType driver_type;
+
+ protected:
   // Intention here is to have a polymorphic update procedure for public_handle_
   // which is callable on any Agent* but only from some class dervied from
   // Agent*. do_set_public_handle should remain protected or private in all
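
The agent.h change threads a DriverType through every Agent constructor and exposes it as a public const member, so runtime code can ask which kernel-mode driver backs a given agent. A self-contained sketch of that pattern is below; the enum values and class shapes are stand-ins for the definitions in core/inc/driver.h and core/inc/agent.h, not the ROCr code itself.

#include <cstdint>

// Stand-ins for the real enums in the ROCr headers.
enum class DriverType { KFD, XDNA };
enum class DeviceType { Cpu, Gpu, Aie };

class Agent {
public:
  Agent(DriverType drv_type, std::uint32_t node_id, DeviceType type)
      : driver_type(drv_type), node_id_(node_id), device_type_(type) {}
  virtual ~Agent() = default;

  const DriverType driver_type; // public const member, as in the diff above

private:
  std::uint32_t node_id_;
  DeviceType device_type_;
};

// A GPU agent is tagged as KFD-backed; an XDNA AIE agent would pass
// DriverType::XDNA instead.
class ExampleGpuAgent : public Agent {
public:
  explicit ExampleGpuAgent(std::uint32_t node_id)
      : Agent(DriverType::KFD, node_id, DeviceType::Gpu) {}
};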
