 #include "hsakmt/hsakmt.h"
 
+#include "core/inc/amd_cpu_agent.h"
+#include "core/inc/amd_gpu_agent.h"
+#include "core/inc/amd_memory_region.h"
+#include "core/inc/exceptions.h"
 #include "core/inc/runtime.h"
 
 namespace rocr {
@@ -70,18 +74,155 @@ hsa_status_t KfdDriver::QueryKernelModeDriver(core::DriverQuery query) {
   return HSA_STATUS_SUCCESS;
 }
 
-hsa_status_t KfdDriver::GetMemoryProperties(uint32_t node_id,
-                                            core::MemProperties &mprops) const {
+hsa_status_t
+KfdDriver::GetMemoryProperties(uint32_t node_id,
+                               core::MemoryRegion &mem_region) const {
   return HSA_STATUS_SUCCESS;
 }
 
-hsa_status_t KfdDriver::AllocateMemory(void **mem, size_t size,
-                                       uint32_t node_id, core::MemFlags flags) {
-  return HSA_STATUS_SUCCESS;
+hsa_status_t
+KfdDriver::AllocateMemory(const core::MemoryRegion &mem_region,
+                          core::MemoryRegion::AllocateFlags alloc_flags,
+                          void **mem, size_t size, uint32_t agent_node_id) {
+  const MemoryRegion &m_region(static_cast<const MemoryRegion &>(mem_region));
+  HsaMemFlags kmt_alloc_flags(m_region.mem_flags());
+
+  kmt_alloc_flags.ui32.ExecuteAccess =
+      (alloc_flags & core::MemoryRegion::AllocateExecutable ? 1 : 0);
+  kmt_alloc_flags.ui32.AQLQueueMemory =
+      (alloc_flags & core::MemoryRegion::AllocateDoubleMap ? 1 : 0);
+
+  if (m_region.IsSystem() &&
+      (alloc_flags & core::MemoryRegion::AllocateNonPaged)) {
+    kmt_alloc_flags.ui32.NonPaged = 1;
+  }
+
+  // Allocating a memory handle for virtual memory
+  kmt_alloc_flags.ui32.NoAddress =
+      !!(alloc_flags & core::MemoryRegion::AllocateMemoryOnly);
+
+  // Allocate pseudo fine grain memory
+  kmt_alloc_flags.ui32.CoarseGrain =
+      (alloc_flags & core::MemoryRegion::AllocatePCIeRW
+           ? 0
+           : kmt_alloc_flags.ui32.CoarseGrain);
+
+  kmt_alloc_flags.ui32.NoSubstitute =
+      (alloc_flags & core::MemoryRegion::AllocatePinned
+           ? 1
+           : kmt_alloc_flags.ui32.NoSubstitute);
+
+  kmt_alloc_flags.ui32.GTTAccess =
+      (alloc_flags & core::MemoryRegion::AllocateGTTAccess
+           ? 1
+           : kmt_alloc_flags.ui32.GTTAccess);
+
+  if (m_region.IsLocalMemory()) {
+    // Allocate physically contiguous memory. AllocateKfdMemory function call
+    // will fail if this flag is not supported in KFD.
+    kmt_alloc_flags.ui32.Contiguous =
+        (alloc_flags & core::MemoryRegion::AllocateContiguous
+             ? 1
+             : kmt_alloc_flags.ui32.Contiguous);
+  }
+
+  // Only allow using the suballocator for ordinary VRAM.
+  if (m_region.IsLocalMemory() && !kmt_alloc_flags.ui32.NoAddress) {
+    bool subAllocEnabled =
+        !core::Runtime::runtime_singleton_->flag().disable_fragment_alloc();
+    // Avoid modifying executable or queue allocations.
+    bool useSubAlloc = subAllocEnabled;
+    useSubAlloc &=
+        ((alloc_flags & (~core::MemoryRegion::AllocateRestrict)) == 0);
+
+    if (useSubAlloc) {
+      *mem = m_region.fragment_alloc(size);
+
+      if ((alloc_flags & core::MemoryRegion::AllocateAsan) &&
+          hsaKmtReplaceAsanHeaderPage(*mem) != HSAKMT_STATUS_SUCCESS) {
+        m_region.fragment_free(*mem);
+        *mem = nullptr;
+        return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
+      }
+
+      return HSA_STATUS_SUCCESS;
+    }
+  }
+
+  const uint32_t node_id =
+      (alloc_flags & core::MemoryRegion::AllocateGTTAccess)
+          ? agent_node_id
+          : m_region.owner()->node_id();
+
+  // Allocate memory.
+  // If it fails attempt to release memory from the block allocator and retry.
+  *mem = AllocateKfdMemory(kmt_alloc_flags, node_id, size);
+  if (*mem == nullptr) {
+    m_region.owner()->Trim();
+    *mem = AllocateKfdMemory(kmt_alloc_flags, node_id, size);
+  }
+
+  if (*mem != nullptr) {
+    if (kmt_alloc_flags.ui32.NoAddress)
+      return HSA_STATUS_SUCCESS;
+
+    // Commit the memory.
+    // For system memory, on non-restricted allocation, map it to all GPUs. On
+    // restricted allocation, only CPU is allowed to access by default, so
+    // no need to map.
+    // For local memory, only map it to the owning GPU. Mapping to other GPUs,
+    // if the access is allowed, is performed on AllowAccess.
+    HsaMemMapFlags map_flag = m_region.map_flags();
+    size_t map_node_count = 1;
+    const uint32_t owner_node_id = m_region.owner()->node_id();
+    const uint32_t *map_node_id = &owner_node_id;
+
+    if (m_region.IsSystem()) {
+      if ((alloc_flags & core::MemoryRegion::AllocateRestrict) == 0) {
+        // Map to all GPU agents.
+        map_node_count = core::Runtime::runtime_singleton_->gpu_ids().size();
+
+        if (map_node_count == 0) {
+          // No need to pin since no GPU in the platform.
+          return HSA_STATUS_SUCCESS;
+        }
+
+        map_node_id = &core::Runtime::runtime_singleton_->gpu_ids()[0];
+      } else {
+        // No need to pin it for CPU exclusive access.
+        return HSA_STATUS_SUCCESS;
+      }
+    }
+
+    uint64_t alternate_va = 0;
+    const bool is_resident = MakeKfdMemoryResident(
+        map_node_count, map_node_id, *mem, size, &alternate_va, map_flag);
+
+    const bool require_pinning =
+        (!m_region.full_profile() || m_region.IsLocalMemory() ||
+         m_region.IsScratch());
+
+    if (require_pinning && !is_resident) {
+      FreeKfdMemory(*mem, size);
+      *mem = nullptr;
+      return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
+    }
+
+    if ((alloc_flags & core::MemoryRegion::AllocateAsan) &&
+        hsaKmtReplaceAsanHeaderPage(*mem) != HSAKMT_STATUS_SUCCESS) {
+      FreeKfdMemory(*mem, size);
+      *mem = nullptr;
+      return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
+    }
+    return HSA_STATUS_SUCCESS;
+  }
+
+  return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
 }
 
-hsa_status_t KfdDriver::FreeMemory(void *mem, uint32_t node_id) {
-  return HSA_STATUS_SUCCESS;
+hsa_status_t KfdDriver::FreeMemory(void *mem, size_t size) {
+  MakeKfdMemoryUnresident(mem);
+  return FreeKfdMemory(mem, size) ? HSA_STATUS_SUCCESS : HSA_STATUS_ERROR;
 }
 
 hsa_status_t KfdDriver::CreateQueue(core::Queue &queue) {
@@ -92,5 +233,45 @@ hsa_status_t KfdDriver::DestroyQueue(core::Queue &queue) const {
   return HSA_STATUS_SUCCESS;
 }
 
+void *KfdDriver::AllocateKfdMemory(const HsaMemFlags &flags, uint32_t node_id,
+                                   size_t size) {
+  void *mem = nullptr;
+  const HSAKMT_STATUS status = hsaKmtAllocMemory(node_id, size, flags, &mem);
+  return (status == HSAKMT_STATUS_SUCCESS) ? mem : nullptr;
+}
+
+bool KfdDriver::FreeKfdMemory(void *mem, size_t size) {
+  if (mem == nullptr || size == 0) {
+    debug_print("Invalid free ptr:%p size:%lu\n", mem, size);
+    return true;
+  }
+
+  if (hsaKmtFreeMemory(mem, size) != HSAKMT_STATUS_SUCCESS) {
+    debug_print("Failed to free ptr:%p size:%lu\n", mem, size);
+    return false;
+  }
+  return true;
+}
+
+bool KfdDriver::MakeKfdMemoryResident(size_t num_node, const uint32_t *nodes,
+                                      const void *mem, size_t size,
+                                      uint64_t *alternate_va,
+                                      HsaMemMapFlags map_flag) {
+  assert(num_node > 0);
+  assert(nodes);
+
+  *alternate_va = 0;
+
+  HSAKMT_STATUS kmt_status(hsaKmtMapMemoryToGPUNodes(
+      const_cast<void *>(mem), size, alternate_va, map_flag, num_node,
+      const_cast<uint32_t *>(nodes)));
+
+  return (kmt_status == HSAKMT_STATUS_SUCCESS);
+}
+
+void KfdDriver::MakeKfdMemoryUnresident(const void *mem) {
+  hsaKmtUnmapMemoryToGPU(const_cast<void *>(mem));
+}
+
 } // namespace AMD
 } // namespace rocr
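The new AllocateMemory path boils down to: translate the region's AllocateFlags into HsaMemFlags, use the fragment suballocator for ordinary VRAM, otherwise allocate through KFD, retry once after trimming cached fragments, then map the range and roll the allocation back if it cannot be made resident. The following is a minimal stand-alone sketch of just that allocate / trim-and-retry / map / roll-back skeleton; try_alloc, trim_caches, map_to_gpus, and release are invented stand-ins for this illustration, not ROCr or libhsakmt APIs.

// Hypothetical stand-ins modelling hsaKmtAllocMemory, owner()->Trim(),
// hsaKmtMapMemoryToGPUNodes, and hsaKmtFreeMemory for the sketch only.
#include <cstddef>
#include <cstdio>
#include <cstdlib>

static void *try_alloc(std::size_t size) { return std::malloc(size); }
static void trim_caches() { /* return cached fragments to the driver */ }
static bool map_to_gpus(void *ptr, std::size_t size) {
  return ptr != nullptr && size != 0;  // stand-in for establishing residency
}
static void release(void *ptr, std::size_t /*size*/) { std::free(ptr); }

// Mirrors the control flow above: allocate, retry once after trimming,
// then map and undo the allocation if the range cannot be made resident.
static void *allocate_and_map(std::size_t size) {
  void *mem = try_alloc(size);
  if (mem == nullptr) {
    trim_caches();
    mem = try_alloc(size);
  }
  if (mem == nullptr)
    return nullptr;

  if (!map_to_gpus(mem, size)) {
    release(mem, size);
    return nullptr;
  }
  return mem;
}

int main() {
  void *p = allocate_and_map(4096);
  std::printf("allocation %s\n", p != nullptr ? "succeeded" : "failed");
  release(p, 4096);
  return 0;
}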