// Copyright 2024 The IREE Authors
//
// Licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

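// Smoke test that launches an AIE kernel through the HSA vendor (ERT) packet
// interface: it loads a PDI and its DPU instruction stream, configures a
// hardware context on an AIE agent's queue, dispatches the "add one" kernel
// over 1024 elements, and checks that every output equals its input plus one.
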
#include <sys/mman.h>

#include <cassert>
#include <cstdint>
#include <cstring>
#include <filesystem>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>

#include "hsa/hsa.h"
#include "hsa/hsa_ext_amd.h"

namespace {

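// Appends `agent` to `agents` if its device type matches
// `requested_dev_type`.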
hsa_status_t get_agent(hsa_agent_t agent, std::vector<hsa_agent_t> *agents,
                       hsa_device_type_t requested_dev_type) {
  if (!agents || !(requested_dev_type == HSA_DEVICE_TYPE_AIE ||
                   requested_dev_type == HSA_DEVICE_TYPE_GPU ||
                   requested_dev_type == HSA_DEVICE_TYPE_CPU)) {
    return HSA_STATUS_ERROR_INVALID_ARGUMENT;
  }

  hsa_device_type_t device_type;
  hsa_status_t ret =
      hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &device_type);

  if (ret != HSA_STATUS_SUCCESS) {
    return ret;
  }

  if (device_type == requested_dev_type) {
    agents->push_back(agent);
  }

  return ret;
}

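// hsa_iterate_agents callback that appends every AIE agent in the system to
// the std::vector<hsa_agent_t> passed through `data`.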
hsa_status_t get_aie_agents(hsa_agent_t agent, void *data) {
  if (!data) {
    return HSA_STATUS_ERROR_INVALID_ARGUMENT;
  }

  auto *aie_agents = reinterpret_cast<std::vector<hsa_agent_t> *>(data);
  return get_agent(agent, aie_agents, HSA_DEVICE_TYPE_AIE);
}

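// Memory pool iteration callback that records a coarse-grained global pool in
// `data`. When `kernarg` is set the pool must also support kernel arguments;
// otherwise it must not.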
hsa_status_t get_coarse_global_mem_pool(hsa_amd_memory_pool_t pool, void *data,
                                        bool kernarg) {
  hsa_amd_segment_t segment_type;
  auto ret = hsa_amd_memory_pool_get_info(
      pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &segment_type);
  if (ret != HSA_STATUS_SUCCESS) {
    return ret;
  }

  if (segment_type == HSA_AMD_SEGMENT_GLOBAL) {
    hsa_amd_memory_pool_global_flag_t global_pool_flags;
    ret = hsa_amd_memory_pool_get_info(
        pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &global_pool_flags);
    if (ret != HSA_STATUS_SUCCESS) {
      return ret;
    }

    if (kernarg) {
      if ((global_pool_flags &
           HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED) &&
          (global_pool_flags & HSA_REGION_GLOBAL_FLAG_KERNARG)) {
        *static_cast<hsa_amd_memory_pool_t *>(data) = pool;
      }
    } else {
      if ((global_pool_flags &
           HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED) &&
          !(global_pool_flags & HSA_REGION_GLOBAL_FLAG_KERNARG)) {
        *static_cast<hsa_amd_memory_pool_t *>(data) = pool;
      }
    }
  }

  return HSA_STATUS_SUCCESS;
}

hsa_status_t get_coarse_global_dev_mem_pool(hsa_amd_memory_pool_t pool,
                                            void *data) {
  return get_coarse_global_mem_pool(pool, data, false);
}

hsa_status_t get_coarse_global_kernarg_mem_pool(hsa_amd_memory_pool_t pool,
                                                void *data) {
  return get_coarse_global_mem_pool(pool, data, true);
}

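// Reads a PDI (device image) file into a buffer allocated from `mem_pool`,
// returned through `buf`.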
void load_pdi_file(hsa_amd_memory_pool_t mem_pool, const std::string &file_name,
                   void **buf) {
  std::ifstream bin_file(file_name,
                         std::ios::binary | std::ios::ate | std::ios::in);

  assert(!bin_file.fail());

  auto size(bin_file.tellg());

  bin_file.seekg(0, std::ios::beg);
  auto r = hsa_amd_memory_pool_allocate(mem_pool, size, 0, buf);
  assert(r == HSA_STATUS_SUCCESS);
  bin_file.read(reinterpret_cast<char *>(*buf), size);
}

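// Parses a DPU instruction file (whitespace-separated hex words) into a
// buffer allocated from `mem_pool`, returned through `buf`.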
void load_dpu_file(hsa_amd_memory_pool_t mem_pool, const std::string &file_name,
                   void **buf) {
  std::ifstream bin_file(file_name,
                         std::ios::binary | std::ios::ate | std::ios::in);

  assert(!bin_file.fail());

  auto size(bin_file.tellg());
  bin_file.seekg(0, std::ios::beg);
  std::vector<uint32_t> pdi_vec;
  std::string val;

  while (bin_file >> val) {
    pdi_vec.push_back(std::stoul(val, nullptr, 16));
  }
  // The hex text is larger than the decoded instruction stream, so the file
  // size is a safe upper bound for the allocation.
  auto r = hsa_amd_memory_pool_allocate(mem_pool, size, 0, buf);
  assert(r == HSA_STATUS_SUCCESS);
  std::memcpy(*buf, pdi_vec.data(), pdi_vec.size() * sizeof(uint32_t));
}

}  // namespace

int main(int argc, char **argv) {
  assert(argc == 2 && "expected the directory containing the test artifacts");
  std::filesystem::path sourcePath(argv[1]);
  // List of AIE agents in the system.
  std::vector<hsa_agent_t> aie_agents;
  // For creating a queue on an AIE agent.
  hsa_queue_t *aie_queue(nullptr);
  // Memory pool for allocating device-mapped memory. Used for PDI/DPU
  // instructions.
  hsa_amd_memory_pool_t global_dev_mem_pool{0};
  // System memory pool. Used for allocating kernel argument data.
  hsa_amd_memory_pool_t global_kernarg_mem_pool{0};
  const std::string dpu_inst_file_name(sourcePath / "add_one_insts.txt");
  const std::string pdi_file_name(sourcePath / "add_one.pdi");
  uint32_t *dpu_inst_buf(nullptr);
  uint64_t *pdi_buf(nullptr);

  assert(aie_agents.empty());
  assert(global_dev_mem_pool.handle == 0);
  assert(global_kernarg_mem_pool.handle == 0);

  // Initialize the runtime.
  auto r = hsa_init();
  assert(r == HSA_STATUS_SUCCESS);

  // AIE ERT packets are written directly into the queue's packet ring below,
  // so they must be the same size as a standard dispatch packet.
  assert(sizeof(hsa_kernel_dispatch_packet_s) ==
         sizeof(hsa_amd_aie_ert_packet_s));

  // Test a launch of an AIE kernel using the HSA API.
  // Find the AIE agents in the system; exactly one is expected.
  r = hsa_iterate_agents(get_aie_agents, &aie_agents);
  assert(r == HSA_STATUS_SUCCESS);
  assert(aie_agents.size() == 1);

  const auto &aie_agent = aie_agents.front();

  // Create a single-producer queue on the AIE agent.
  r = hsa_queue_create(aie_agent, 64, HSA_QUEUE_TYPE_SINGLE, nullptr, nullptr,
                       0, 0, &aie_queue);
  assert(r == HSA_STATUS_SUCCESS);
  assert(aie_queue);
  assert(aie_queue->base_address);

  // Find a pool for DEV BOs. This is a global system memory pool that is
  // mapped to the device. Will be used for PDIs and DPU instructions.
  r = hsa_amd_agent_iterate_memory_pools(
      aie_agent, get_coarse_global_dev_mem_pool, &global_dev_mem_pool);
  assert(r == HSA_STATUS_SUCCESS);
  assert(global_dev_mem_pool.handle);

  // Find a pool that supports kernel args. This is just normal system memory.
  // It will be used for commands and input data.
  r = hsa_amd_agent_iterate_memory_pools(
      aie_agent, get_coarse_global_kernarg_mem_pool, &global_kernarg_mem_pool);
  assert(r == HSA_STATUS_SUCCESS);
  assert(global_kernarg_mem_pool.handle);

  // Load the DPU and PDI files into a global pool that doesn't support kernel
  // args (DEV BO).
  load_dpu_file(global_dev_mem_pool, dpu_inst_file_name,
                reinterpret_cast<void **>(&dpu_inst_buf));
  uint32_t dpu_handle = 0;
  r = hsa_amd_get_handle_from_vaddr(dpu_inst_buf, &dpu_handle);
  assert(r == HSA_STATUS_SUCCESS);
  assert(dpu_handle != 0);

  load_pdi_file(global_dev_mem_pool, pdi_file_name,
                reinterpret_cast<void **>(&pdi_buf));
  uint32_t pdi_handle = 0;
  r = hsa_amd_get_handle_from_vaddr(pdi_buf, &pdi_handle);
  assert(r == HSA_STATUS_SUCCESS);
  assert(pdi_handle != 0);

  hsa_amd_aie_ert_hw_ctx_cu_config_t cu_config{.cu_config_bo = pdi_handle,
                                               .cu_func = 0};

  hsa_amd_aie_ert_hw_ctx_config_cu_param_t config_cu_args{
      .num_cus = 1, .cu_configs = &cu_config};

  // Configure the queue's hardware context.
  r = hsa_amd_queue_hw_ctx_config(
      aie_queue, HSA_AMD_QUEUE_AIE_ERT_HW_CXT_CONFIG_CU, &config_cu_args);
  assert(r == HSA_STATUS_SUCCESS);

  // Create the input and output buffers and fill them with test data.
  constexpr std::size_t num_data_elements = 1024;
  constexpr std::size_t data_buffer_size =
      num_data_elements * sizeof(std::uint32_t);

  std::uint32_t *input = {};
  r = hsa_amd_memory_pool_allocate(global_dev_mem_pool, data_buffer_size, 0,
                                   reinterpret_cast<void **>(&input));
  assert(r == HSA_STATUS_SUCCESS);
  std::uint32_t input_handle = {};
  r = hsa_amd_get_handle_from_vaddr(input, &input_handle);
  assert(r == HSA_STATUS_SUCCESS);
  assert(input_handle != 0);

  std::uint32_t *output = {};
  r = hsa_amd_memory_pool_allocate(global_dev_mem_pool, data_buffer_size, 0,
                                   reinterpret_cast<void **>(&output));
  assert(r == HSA_STATUS_SUCCESS);
  std::uint32_t output_handle = {};
  r = hsa_amd_get_handle_from_vaddr(output, &output_handle);
  assert(r == HSA_STATUS_SUCCESS);
  assert(output_handle != 0);

  // Seed the inputs and poison the outputs so stale data is detectable.
  for (std::size_t i = 0; i < num_data_elements; i++) {
    input[i] = i;
    output[i] = 0xDEFACE;
  }

  // Create the command packet that carries the kernel launch request.
  hsa_amd_aie_ert_packet_t *cmd_pkt = nullptr;
  r = hsa_amd_memory_pool_allocate(global_kernarg_mem_pool, 64, 0,
                                   reinterpret_cast<void **>(&cmd_pkt));
  assert(r == HSA_STATUS_SUCCESS);
  cmd_pkt->state = HSA_AMD_AIE_ERT_STATE_NEW;
  cmd_pkt->count = 0xA;  // # of arguments to put in command
  cmd_pkt->opcode = HSA_AMD_AIE_ERT_START_CU;
  cmd_pkt->header.AmdFormat = HSA_AMD_PACKET_TYPE_AIE_ERT;
  cmd_pkt->header.header = HSA_PACKET_TYPE_VENDOR_SPECIFIC
                           << HSA_PACKET_HEADER_TYPE;

  // Create the payload holding the kernel arguments for the packet.
  hsa_amd_aie_ert_start_kernel_data_t *cmd_payload = nullptr;
  uint32_t cmd_handle;
  r = hsa_amd_get_handle_from_vaddr(reinterpret_cast<void *>(cmd_pkt),
                                    &cmd_handle);
  assert(r == HSA_STATUS_SUCCESS);
  r = hsa_amd_memory_pool_allocate(global_kernarg_mem_pool, 64, 0,
                                   reinterpret_cast<void **>(&cmd_payload));
  assert(r == HSA_STATUS_SUCCESS);
  cmd_payload->cu_mask = 0x1;  // Selecting the PDI to use with this command
  cmd_payload->data[0] = 0x3;  // Transaction opcode
  cmd_payload->data[1] = 0x0;
  cmd_payload->data[2] = dpu_handle;
  cmd_payload->data[3] = 0x0;
  cmd_payload->data[4] = 0x44;  // Size of DPU instruction
  cmd_payload->data[5] = input_handle;
  cmd_payload->data[6] = 0;
  cmd_payload->data[7] = output_handle;
  cmd_payload->data[8] = 0;
  cmd_pkt->payload_data = reinterpret_cast<uint64_t>(cmd_payload);

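  // Reserve a slot in the queue, copy the command packet into the ring
  // buffer, and ring the doorbell so the AIE agent picks it up.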
  uint64_t wr_idx = hsa_queue_add_write_index_relaxed(aie_queue, 1);
  uint64_t packet_id = wr_idx % aie_queue->size;
  reinterpret_cast<hsa_amd_aie_ert_packet_t *>(
      aie_queue->base_address)[packet_id] = *cmd_pkt;
  hsa_signal_store_screlease(aie_queue->doorbell_signal, wr_idx);

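  // The "add one" kernel should have written input + 1 for every element.
  // Note that the packet carries no completion signal, so this check assumes
  // the command has already been processed by this point.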
  for (std::size_t i = 0; i < num_data_elements; i++) {
    const auto expected = input[i] + 1;
    const auto result = output[i];
    assert(result == expected);
  }

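  // Tear down: destroy the queue, release the pool allocations, and shut the
  // runtime down.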
  r = hsa_queue_destroy(aie_queue);
  assert(r == HSA_STATUS_SUCCESS);

  r = hsa_amd_memory_pool_free(output);
  assert(r == HSA_STATUS_SUCCESS);
  r = hsa_amd_memory_pool_free(input);
  assert(r == HSA_STATUS_SUCCESS);
  r = hsa_amd_memory_pool_free(pdi_buf);
  assert(r == HSA_STATUS_SUCCESS);
  r = hsa_amd_memory_pool_free(dpu_inst_buf);
  assert(r == HSA_STATUS_SUCCESS);

  r = hsa_shut_down();
  assert(r == HSA_STATUS_SUCCESS);
  std::cout << "PASS\n";
  return 0;
}