// Copyright 2024 The IREE Authors
//
// Licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

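// Smoke test that launches an AIE kernel through the HSA vendor (ERT) packet
// interface: it loads a PDI and its DPU instruction stream, configures a
// hardware context on an AIE agent's queue, dispatches the "add one" kernel
// over 1024 elements, and checks that every output equals its input plus one.
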
#include <sys/mman.h>

#include <cassert>
#include <cstdint>
#include <cstring>
#include <filesystem>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>

#include "hsa/hsa.h"
#include "hsa/hsa_ext_amd.h"

namespace {

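// Appends `agent` to `agents` if its device type matches
// `requested_dev_type`.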
hsa_status_t get_agent(hsa_agent_t agent, std::vector<hsa_agent_t> *agents,
                       hsa_device_type_t requested_dev_type) {
  if (!agents || !(requested_dev_type == HSA_DEVICE_TYPE_AIE ||
                   requested_dev_type == HSA_DEVICE_TYPE_GPU ||
                   requested_dev_type == HSA_DEVICE_TYPE_CPU)) {
    return HSA_STATUS_ERROR_INVALID_ARGUMENT;
  }

  hsa_device_type_t device_type;
  hsa_status_t ret =
      hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &device_type);

  if (ret != HSA_STATUS_SUCCESS) {
    return ret;
  }

  if (device_type == requested_dev_type) {
    agents->push_back(agent);
  }

  return ret;
}

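// hsa_iterate_agents callback that appends every AIE agent in the system to
// the std::vector<hsa_agent_t> passed through `data`.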
hsa_status_t get_aie_agents(hsa_agent_t agent, void *data) {
  if (!data) {
    return HSA_STATUS_ERROR_INVALID_ARGUMENT;
  }

  auto *aie_agents = reinterpret_cast<std::vector<hsa_agent_t> *>(data);
  return get_agent(agent, aie_agents, HSA_DEVICE_TYPE_AIE);
}

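// Memory pool iteration callback that records a coarse-grained global pool in
// `data`. When `kernarg` is set the pool must also support kernel arguments;
// otherwise it must not.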
hsa_status_t get_coarse_global_mem_pool(hsa_amd_memory_pool_t pool, void *data,
                                        bool kernarg) {
  hsa_amd_segment_t segment_type;
  auto ret = hsa_amd_memory_pool_get_info(
      pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &segment_type);
  if (ret != HSA_STATUS_SUCCESS) {
    return ret;
  }

  if (segment_type == HSA_AMD_SEGMENT_GLOBAL) {
    hsa_amd_memory_pool_global_flag_t global_pool_flags;
    ret = hsa_amd_memory_pool_get_info(
        pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &global_pool_flags);
    if (ret != HSA_STATUS_SUCCESS) {
      return ret;
    }

    if (kernarg) {
      if ((global_pool_flags &
           HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED) &&
          (global_pool_flags & HSA_REGION_GLOBAL_FLAG_KERNARG)) {
        *static_cast<hsa_amd_memory_pool_t *>(data) = pool;
      }
    } else {
      if ((global_pool_flags &
           HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED) &&
          !(global_pool_flags & HSA_REGION_GLOBAL_FLAG_KERNARG)) {
        *static_cast<hsa_amd_memory_pool_t *>(data) = pool;
      }
    }
  }

  return HSA_STATUS_SUCCESS;
}

hsa_status_t get_coarse_global_dev_mem_pool(hsa_amd_memory_pool_t pool,
                                            void *data) {
  return get_coarse_global_mem_pool(pool, data, false);
}

hsa_status_t get_coarse_global_kernarg_mem_pool(hsa_amd_memory_pool_t pool,
                                                void *data) {
  return get_coarse_global_mem_pool(pool, data, true);
}

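// Reads a PDI (device image) file into a buffer allocated from `mem_pool`,
// returned through `buf`.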
void load_pdi_file(hsa_amd_memory_pool_t mem_pool, const std::string &file_name,
                   void **buf) {
  std::ifstream bin_file(file_name,
                         std::ios::binary | std::ios::ate | std::ios::in);

  assert(!bin_file.fail());

  auto size(bin_file.tellg());

  bin_file.seekg(0, std::ios::beg);
  auto r = hsa_amd_memory_pool_allocate(mem_pool, size, 0, buf);
  assert(r == HSA_STATUS_SUCCESS);
  bin_file.read(reinterpret_cast<char *>(*buf), size);
}

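// Parses a DPU instruction file (whitespace-separated hex words) into a
// buffer allocated from `mem_pool`, returned through `buf`.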
void load_dpu_file(hsa_amd_memory_pool_t mem_pool, const std::string &file_name,
                   void **buf) {
  std::ifstream bin_file(file_name,
                         std::ios::binary | std::ios::ate | std::ios::in);

  assert(!bin_file.fail());

  auto size(bin_file.tellg());
  bin_file.seekg(0, std::ios::beg);
  std::vector<uint32_t> pdi_vec;
  std::string val;

  while (bin_file >> val) {
    pdi_vec.push_back(std::stoul(val, nullptr, 16));
  }
  // The hex text is larger than the decoded instruction stream, so the file
  // size is a safe upper bound for the allocation.
  auto r = hsa_amd_memory_pool_allocate(mem_pool, size, 0, buf);
  assert(r == HSA_STATUS_SUCCESS);
  std::memcpy(*buf, pdi_vec.data(), pdi_vec.size() * sizeof(uint32_t));
}

}  // namespace

int main(int argc, char **argv) {
  assert(argc == 2 && "expected the directory containing the test artifacts");
  std::filesystem::path sourcePath(argv[1]);
  // List of AIE agents in the system.
  std::vector<hsa_agent_t> aie_agents;
  // For creating a queue on an AIE agent.
  hsa_queue_t *aie_queue(nullptr);
  // Memory pool for allocating device-mapped memory. Used for PDI/DPU
  // instructions.
  hsa_amd_memory_pool_t global_dev_mem_pool{0};
  // System memory pool. Used for allocating kernel argument data.
  hsa_amd_memory_pool_t global_kernarg_mem_pool{0};
  const std::string dpu_inst_file_name(sourcePath / "add_one_insts.txt");
  const std::string pdi_file_name(sourcePath / "add_one.pdi");
  uint32_t *dpu_inst_buf(nullptr);
  uint64_t *pdi_buf(nullptr);

  assert(aie_agents.empty());
  assert(global_dev_mem_pool.handle == 0);
  assert(global_kernarg_mem_pool.handle == 0);

  // Initialize the runtime.
  auto r = hsa_init();
  assert(r == HSA_STATUS_SUCCESS);

  // AIE ERT packets are written directly into the queue's packet ring below,
  // so they must be the same size as a standard dispatch packet.
  assert(sizeof(hsa_kernel_dispatch_packet_s) ==
         sizeof(hsa_amd_aie_ert_packet_s));

  // Test a launch of an AIE kernel using the HSA API.
  // Find the AIE agents in the system; exactly one is expected.
  r = hsa_iterate_agents(get_aie_agents, &aie_agents);
  assert(r == HSA_STATUS_SUCCESS);
  assert(aie_agents.size() == 1);

  const auto &aie_agent = aie_agents.front();

  // Create a single-producer queue on the AIE agent.
  r = hsa_queue_create(aie_agent, 64, HSA_QUEUE_TYPE_SINGLE, nullptr, nullptr,
                       0, 0, &aie_queue);
  assert(r == HSA_STATUS_SUCCESS);
  assert(aie_queue);
  assert(aie_queue->base_address);

  // Find a pool for DEV BOs. This is a global system memory pool that is
  // mapped to the device. Will be used for PDIs and DPU instructions.
  r = hsa_amd_agent_iterate_memory_pools(
      aie_agent, get_coarse_global_dev_mem_pool, &global_dev_mem_pool);
  assert(r == HSA_STATUS_SUCCESS);
  assert(global_dev_mem_pool.handle);

  // Find a pool that supports kernel args. This is just normal system memory.
  // It will be used for commands and input data.
  r = hsa_amd_agent_iterate_memory_pools(
      aie_agent, get_coarse_global_kernarg_mem_pool, &global_kernarg_mem_pool);
  assert(r == HSA_STATUS_SUCCESS);
  assert(global_kernarg_mem_pool.handle);

  // Load the DPU and PDI files into a global pool that doesn't support kernel
  // args (DEV BO).
  load_dpu_file(global_dev_mem_pool, dpu_inst_file_name,
                reinterpret_cast<void **>(&dpu_inst_buf));
  uint32_t dpu_handle = 0;
  r = hsa_amd_get_handle_from_vaddr(dpu_inst_buf, &dpu_handle);
  assert(r == HSA_STATUS_SUCCESS);
  assert(dpu_handle != 0);

  load_pdi_file(global_dev_mem_pool, pdi_file_name,
                reinterpret_cast<void **>(&pdi_buf));
  uint32_t pdi_handle = 0;
  r = hsa_amd_get_handle_from_vaddr(pdi_buf, &pdi_handle);
  assert(r == HSA_STATUS_SUCCESS);
  assert(pdi_handle != 0);

  hsa_amd_aie_ert_hw_ctx_cu_config_t cu_config{.cu_config_bo = pdi_handle,
                                               .cu_func = 0};

  hsa_amd_aie_ert_hw_ctx_config_cu_param_t config_cu_args{
      .num_cus = 1, .cu_configs = &cu_config};

  // Configure the queue's hardware context.
  r = hsa_amd_queue_hw_ctx_config(
      aie_queue, HSA_AMD_QUEUE_AIE_ERT_HW_CXT_CONFIG_CU, &config_cu_args);
  assert(r == HSA_STATUS_SUCCESS);

  // Create the input and output buffers and fill them with test data.
  constexpr std::size_t num_data_elements = 1024;
  constexpr std::size_t data_buffer_size =
      num_data_elements * sizeof(std::uint32_t);

  std::uint32_t *input = {};
  r = hsa_amd_memory_pool_allocate(global_dev_mem_pool, data_buffer_size, 0,
                                   reinterpret_cast<void **>(&input));
  assert(r == HSA_STATUS_SUCCESS);
  std::uint32_t input_handle = {};
  r = hsa_amd_get_handle_from_vaddr(input, &input_handle);
  assert(r == HSA_STATUS_SUCCESS);
  assert(input_handle != 0);

  std::uint32_t *output = {};
  r = hsa_amd_memory_pool_allocate(global_dev_mem_pool, data_buffer_size, 0,
                                   reinterpret_cast<void **>(&output));
  assert(r == HSA_STATUS_SUCCESS);
  std::uint32_t output_handle = {};
  r = hsa_amd_get_handle_from_vaddr(output, &output_handle);
  assert(r == HSA_STATUS_SUCCESS);
  assert(output_handle != 0);

  // Seed the inputs and poison the outputs so stale data is detectable.
  for (std::size_t i = 0; i < num_data_elements; i++) {
    input[i] = i;
    output[i] = 0xDEFACE;
  }

  // Create the command packet that carries the kernel launch request.
  hsa_amd_aie_ert_packet_t *cmd_pkt = nullptr;
  r = hsa_amd_memory_pool_allocate(global_kernarg_mem_pool, 64, 0,
                                   reinterpret_cast<void **>(&cmd_pkt));
  assert(r == HSA_STATUS_SUCCESS);
  cmd_pkt->state = HSA_AMD_AIE_ERT_STATE_NEW;
  cmd_pkt->count = 0xA;  // # of arguments to put in command
  cmd_pkt->opcode = HSA_AMD_AIE_ERT_START_CU;
  cmd_pkt->header.AmdFormat = HSA_AMD_PACKET_TYPE_AIE_ERT;
  cmd_pkt->header.header = HSA_PACKET_TYPE_VENDOR_SPECIFIC
                           << HSA_PACKET_HEADER_TYPE;

  // Create the payload holding the kernel arguments for the packet.
  hsa_amd_aie_ert_start_kernel_data_t *cmd_payload = nullptr;
  uint32_t cmd_handle;
  r = hsa_amd_get_handle_from_vaddr(reinterpret_cast<void *>(cmd_pkt),
                                    &cmd_handle);
  assert(r == HSA_STATUS_SUCCESS);
  r = hsa_amd_memory_pool_allocate(global_kernarg_mem_pool, 64, 0,
                                   reinterpret_cast<void **>(&cmd_payload));
  assert(r == HSA_STATUS_SUCCESS);
  cmd_payload->cu_mask = 0x1;  // Selecting the PDI to use with this command
  cmd_payload->data[0] = 0x3;  // Transaction opcode
  cmd_payload->data[1] = 0x0;
  cmd_payload->data[2] = dpu_handle;
  cmd_payload->data[3] = 0x0;
  cmd_payload->data[4] = 0x44;  // Size of DPU instruction
  cmd_payload->data[5] = input_handle;
  cmd_payload->data[6] = 0;
  cmd_payload->data[7] = output_handle;
  cmd_payload->data[8] = 0;
  cmd_pkt->payload_data = reinterpret_cast<uint64_t>(cmd_payload);

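  // Reserve a slot in the queue, copy the command packet into the ring
  // buffer, and ring the doorbell so the AIE agent picks it up.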
  uint64_t wr_idx = hsa_queue_add_write_index_relaxed(aie_queue, 1);
  uint64_t packet_id = wr_idx % aie_queue->size;
  reinterpret_cast<hsa_amd_aie_ert_packet_t *>(
      aie_queue->base_address)[packet_id] = *cmd_pkt;
  hsa_signal_store_screlease(aie_queue->doorbell_signal, wr_idx);

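  // The "add one" kernel should have written input + 1 for every element.
  // Note that the packet carries no completion signal, so this check assumes
  // the command has already been processed by this point.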
  for (std::size_t i = 0; i < num_data_elements; i++) {
    const auto expected = input[i] + 1;
    const auto result = output[i];
    assert(result == expected);
  }

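  // Tear down: destroy the queue, release the pool allocations, and shut the
  // runtime down.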
  r = hsa_queue_destroy(aie_queue);
  assert(r == HSA_STATUS_SUCCESS);

  r = hsa_amd_memory_pool_free(output);
  assert(r == HSA_STATUS_SUCCESS);
  r = hsa_amd_memory_pool_free(input);
  assert(r == HSA_STATUS_SUCCESS);
  r = hsa_amd_memory_pool_free(pdi_buf);
  assert(r == HSA_STATUS_SUCCESS);
  r = hsa_amd_memory_pool_free(dpu_inst_buf);
  assert(r == HSA_STATUS_SUCCESS);

  r = hsa_shut_down();
  assert(r == HSA_STATUS_SUCCESS);
  std::cout << "PASS\n";
  return 0;
}