From 1e34ec22929eaba7bcf1681350ec21aed8f370f7 Mon Sep 17 00:00:00 2001 From: Mamy Ratsimbazafy Date: Mon, 5 Aug 2024 07:23:15 +0200 Subject: [PATCH] AMDGPU JIT compiler (#453) * feat(AMD GPU): initial commit to support AMD GPU (working for ASM but not object code) * feat(AMD GPU): :fire: :fire: end to end JIT compilation to AMD GPU is working --- constantine/math_compiler/codegen_amdgpu.nim | 188 +++++ constantine/math_compiler/codegen_nvidia.nim | 12 +- constantine/platforms/abis/amdcomgr_abi.nim | 735 ++++++++++++++++++ constantine/platforms/abis/amdgpu_abi.nim | 658 ++++++++++++++++ .../{llvm/bindings => abis}/c_abi.nim | 2 +- .../{llvm/bindings => abis}/llvm_abi.nim | 261 ++++++- .../{llvm/bindings => abis}/nvidia_abi.nim | 7 +- constantine/platforms/llvm/llvm.nim | 23 +- research/codegen/x86_instr.nim | 2 +- research/codegen/x86_poc.nim | 2 +- tests/gpu/hello_world_amdgpu.nim | 211 +++++ tests/gpu/hello_world_nvidia.nim | 16 +- tests/gpu/t_nvidia_fp.nim | 12 +- 13 files changed, 2093 insertions(+), 36 deletions(-) create mode 100644 constantine/math_compiler/codegen_amdgpu.nim create mode 100644 constantine/platforms/abis/amdcomgr_abi.nim create mode 100644 constantine/platforms/abis/amdgpu_abi.nim rename constantine/platforms/{llvm/bindings => abis}/c_abi.nim (99%) rename constantine/platforms/{llvm/bindings => abis}/llvm_abi.nim (73%) rename constantine/platforms/{llvm/bindings => abis}/nvidia_abi.nim (99%) create mode 100644 tests/gpu/hello_world_amdgpu.nim diff --git a/constantine/math_compiler/codegen_amdgpu.nim b/constantine/math_compiler/codegen_amdgpu.nim new file mode 100644 index 000000000..c9e84b9ee --- /dev/null +++ b/constantine/math_compiler/codegen_amdgpu.nim @@ -0,0 +1,188 @@ +# Constantine +# Copyright (c) 2018-2019 Status Research & Development GmbH +# Copyright (c) 2020-Present Mamy André-Ratsimbazafy +# Licensed and distributed under either of +# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT). 
+# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). +# at your option. This file may not be copied, modified, or distributed except according to those terms. + +import + constantine/platforms/abis/amdgpu_abi {.all.}, + constantine/platforms/abis/amdcomgr_abi, + constantine/platforms/abis/c_abi, + constantine/platforms/llvm/llvm, + constantine/platforms/primitives, + ./ir + +export + amdgpu_abi, + Flag, flag, wrapOpenArrayLenType + +# ############################################################ +# +# AMD GPUs API +# +# ############################################################ + +# Hip Runtime API +# ------------------------------------------------------------ + +template check*(status: HipError) = + ## Check the status code of a Hip operation + ## Exit program with error if failure + + let code = status # ensure that the input expression is evaluated once only + if code != hipSuccess: + writeStackTrace() + stderr.write(astToStr(status) & " " & $instantiationInfo() & " exited with error: " & $code & '\n') + quit 1 + +func hipModuleGetFunction*(kernel: var HipFunction, module: HipModule, fnName: openArray[char]): HipError {.inline.}= + hipModuleGetFunction(kernel, module, fnName[0].unsafeAddr) + +proc getGcnArchName*(deviceID: int32): string = + var prop: HipDeviceProp + check hipGetDeviceProperties(prop, deviceID) + + for c in prop.gcnArchName: + if c != '\0': + result.add c + +proc hipDeviceInit*(deviceID = 0'i32): HipDevice = + + check hipInit(deviceID.uint32) + + var devCount: int32 + check hipGetDeviceCount(devCount) + if devCount == 0: + echo "hipDeviceInit error: no devices supporting AMD ROCm/HIP" + quit 1 + + var hipDevice: HipDevice + check hipDeviceGet(hipDevice, deviceID) + var name = newString(128) + check hipDeviceGetName(name[0].addr, name.len.int32, hipDevice) + echo "Using HIP Device [", deviceID, "]: ", cstring(name) + echo "AMD GCN ARCH: ", deviceID.getGcnArchName() + + return hipDevice + 
+# ############################################################ +# +# LLVM IR for AMD GPUs +# +# ############################################################ +# +# Note: +# __device__ functions for field and elliptic curve arithmetic +# might be compiled by default with scalar codegen +# +# We will need to either: +# - Derive explicitly a vectorized version of the warp/wave size (32) +# - Derive implicitly a vectorized version, probably with __forceinline__ + +proc wrapInCallableHipKernel*(module: ModuleRef, fn: FnDef) = + ## Create a public wrapper of a Hip device function + ## + ## A function named `addmod` can be found by appending _public + ## check hipModuleGetFunction(fnPointer, cuModule, "addmod_public") + + let pubName = fn.fnImpl.getName() & "_public" + let pubFn = module.addFunction(cstring(pubName), fn.fnTy) + + let ctx = module.getContext() + let builder = ctx.createBuilder() + defer: builder.dispose() + + let blck = ctx.appendBasicBlock(pubFn, "publicKernelBody") + builder.positionAtEnd(blck) + + var args = newSeq[ValueRef](fn.fnTy.countParamTypes()) + for i, arg in mpairs(args): + arg = pubFn.getParam(i.uint32) + discard builder.call2(fn.fnTy, fn.fnImpl, args) + + # A public kernel must return void + builder.retVoid() + pubFn.setCallingConvention(AMDGPU_KERNEL) + +# ############################################################ +# +# Code generation +# +# ############################################################ + +template check*(status: ComgrStatus) = + ## Check the status code of a Comgr operation + ## Exit program with error if failure + + let code = status # ensure that the input expression is evaluated once only + if code != AMD_COMGR_STATUS_SUCCESS: + writeStackTrace() + stderr.write(astToStr(status) & " " & $instantiationInfo() & " exited with error: " & $code & '\n') + quit 1 + + +proc linkAmdGpu*(reloc_obj: seq[byte], gcnArchName: string): seq[byte] {.noInline.} = + ## Link a relocatable object code + ## into an executable that can be used 
through hipModuleLoadData + var roc: ComgrData + check amd_comgr_create_data(AMD_COMGR_DATA_KIND_RELOCATABLE, roc) + defer: check amd_comgr_release_data(roc) + + var ai: ComgrActionInfo + check amd_comgr_create_action_info(ai) + defer: check amd_comgr_destroy_action_info(ai) + + var ds: ComgrDataset + check amd_comgr_create_data_set(ds) + defer: check amd_comgr_destroy_data_set(ds) + + var dsOut: ComgrDataset + check amd_comgr_create_data_set(dsOut) + defer: check amd_comgr_destroy_data_set(dsOut) + + check roc.amd_comgr_set_data(reloc_obj.len.csize_t(), reloc_obj[0].addr) + check roc.amd_comgr_set_data_name("linkAmdGpu-input.o") + check ds.amd_comgr_data_set_add(roc) + + check ai.amd_comgr_action_info_set_isa_name( + cstring("amdgcn-amd-amdhsa--" & gcnArchName) + ) + + check amd_comgr_do_action( + AMD_COMGR_ACTION_LINK_RELOCATABLE_TO_EXECUTABLE, + info = ai, + input = ds, + output = dsOut) + + # Extract the executable + # ------------------------------------------------ + + var exe: ComgrData + check amd_comgr_create_data(AMD_COMGR_DATA_KIND_EXECUTABLE, exe) + defer: check amd_comgr_release_data(exe) + + check amd_comgr_action_data_get_data( + dsOut, AMD_COMGR_DATA_KIND_EXECUTABLE, + index = 0, exe) + + # Query the required buffer size + var size: csize_t + check amd_comgr_get_data( + exe, size, nil) + + # Size includes nul char + # But we use seq[byte] not a string, so Nim doesn't auto-inster a \0 + # Hence allocation size is exact. + result.setLen(int size) + + check amd_comgr_get_data( + exe, size, result[0].addr) + + +# ############################################################ +# +# Code execution +# +# ############################################################ diff --git a/constantine/math_compiler/codegen_nvidia.nim b/constantine/math_compiler/codegen_nvidia.nim index 0315ad131..19e92019d 100644 --- a/constantine/math_compiler/codegen_nvidia.nim +++ b/constantine/math_compiler/codegen_nvidia.nim @@ -7,15 +7,15 @@ # at your option. 
This file may not be copied, modified, or distributed except according to those terms. import - constantine/platforms/llvm/bindings/nvidia_abi {.all.}, - constantine/platforms/llvm/bindings/c_abi, + constantine/platforms/abis/nvidia_abi {.all.}, + constantine/platforms/abis/c_abi, constantine/platforms/llvm/[llvm, nvidia_inlineasm], constantine/platforms/primitives, ./ir export nvidia_abi, nvidia_inlineasm, - Flag, flag + Flag, flag, wrapOpenArrayLenType # ############################################################ # @@ -131,7 +131,7 @@ proc tagCudaKernel(module: ModuleRef, fn: FnDef) = ])) ) -proc setCallableCudaKernel*(module: ModuleRef, fn: FnDef) = +proc wrapInCallableCudaKernel*(module: ModuleRef, fn: FnDef) = ## Create a public wrapper of a cuda device function ## ## A function named `addmod` can be found by appending _public @@ -202,7 +202,7 @@ proc codegenNvidiaPTX*(asy: Assembler_LLVM, sm: tuple[major, minor: int32]): str errMsg.dispose() quit 1 - return machine.emitToString(asy.module, AssemblyFile) + return machine.emitTo[:string](asy.module, AssemblyFile) # ############################################################ # @@ -263,4 +263,4 @@ proc exec*[T](jitFn: CUfunction, r: var T, a, b: T) = check cuMemFree(rGPU) check cuMemFree(aGPU) - check cuMemFree(bGPU) \ No newline at end of file + check cuMemFree(bGPU) diff --git a/constantine/platforms/abis/amdcomgr_abi.nim b/constantine/platforms/abis/amdcomgr_abi.nim new file mode 100644 index 000000000..0ce4b66d6 --- /dev/null +++ b/constantine/platforms/abis/amdcomgr_abi.nim @@ -0,0 +1,735 @@ +# Constantine +# Copyright (c) 2018-2019 Status Research & Development GmbH +# Copyright (c) 2020-Present Mamy André-Ratsimbazafy +# Licensed and distributed under either of +# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT). +# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). +# at your option. 
This file may not be copied, modified, or distributed except according to those terms. + +# AMD Code Object Manager (comgr) +# -------------------------------------------------------------------------------------------------- +# +# https://github.com/ROCm/llvm-project/tree/amd-staging/amd/comgr + +# Overview +# Unlike Nvidia GPUs that use a Virtual ISA that is then recompiled +# by the CUDA driver, AMD GPUs produce object code. +# +# That object code then must be linked by LLD +# https://llvm.org/docs/AMDGPUUsage.html#elf-code-object +# +# Unfortunately LLD is not designed as a library like the rest of LLVM +# and in particular does not provide a C API +# https://groups.google.com/g/llvm-dev/c/K30vI0AU9vg?pli=1 +# +# 1. We can link to it using C++ lld::elf::link (or lld::coff::link or ...) +# https://github.com/llvm/llvm-project/blob/llvmorg-18.1.8/lld/include/lld/Common/Driver.h#L52-L57 +# This is what is done in MLIR: +# https://reviews.llvm.org/D80676#change-KfPawdnRjasK +# +# 2. Alternatively, if we don't want to compile Nim via C++ +# we can have a .cpp file with extern "C" +# and customize the build with +# - querying the nimcache directory via std/compilersettings +# - staticExec the .cpp file +# - passL to link the resulting object +# - Use a C++ linker +# See https://github.com/xrfez/nim_mixed_c_cpp_static/blob/master/helloWorld.nim#L14-L18 +# +# 3. Yet another alternative is to call LDD +# +# 4. MCJIT or OrcJIT can bypass the need of an external linker +# But calling GPU functions is not supported +# https://llvm.org/docs/JITLink.html#jitlink-availability-and-feature-status +# +# 5. 
It might be possible to use AMD HIP RTC API +# as it supports linking LLVM bitcode +# https://rocm.docs.amd.com/projects/HIP/en/docs-6.0.0/user_guide/hip_rtc.html +# +# Criticisms +# +# Solution 3 seems to be wildly adopted, in Julia, IREE and Taichi +# - https://github.com/iree-org/iree/blob/26f77de/compiler/plugins/target/LLVMCPU/internal/EmbeddedLinkerTool.cpp#L42-L48 +# - https://github.com/JuliaGPU/AMDGPU.jl/blob/v0.9.6/src/compiler/codegen.jl#L148-L154 +# - https://github.com/taichi-dev/taichi/pull/6482/files#diff-9ab763eb7ff4e6aca0a97f774cc740c609d0258ac584039d0c8cae099dfea452R90 +# +# However ldd is shipped as a separate tool from LLVM, and as a cryptographic library +# we need to minimize the attack surface, i.e. someone installing a "ldd" script that would be executed byu our code. +# It's harder to replace library paths as those need root or are restricted to a shell session if overloading LD_LIBRARY_PATH +# +# Solution 1 needs temporary files +# Solution 2 too and seems hard to maintain +# Solution 4 is a non-starter +# +# Solution 5 is likely possible as the header offers the following enum value +# to pass to the linker "hiprtcJITInputType: HIPRTC_JIT_INPUT_OBJECT" +# However we don't really need the full RTC since we already did LLVM IR -> object file +# +# Looking deeper into hipRTC we see that it depends on comgr, just like the HIP runtime +# and comgr only roles is dealing with object file. +# It does use LLD under-the-hood but from a fork specialized for AMD purposes: +# https://github.com/ROCm/llvm-project/blob/rocm-6.2.0/amd/comgr/src/comgr-compiler.cpp#L614-L630 +# Hence we solve all of our concerns. + +const + # Generated from Comgr 2.6 + AMD_COMGR_INTERFACE_VERSION_MAJOR {.used.} = 2 + AMD_COMGR_INTERFACE_VERSION_MINOR {.used.} = 6 + +## \defgroup codeobjectmanager Code Object Manager +## @{ +## +## @brief The code object manager is a callable library that provides +## operations for creating and inspecting code objects. 
+## +## The library provides handles to various objects. Concurrent execution of +## operations is supported provided all objects accessed by each concurrent +## operation are disjoint. For example, the @p amd_comgr_data_set_t handles +## passed to operations must be disjoint, together with all the @p +## amd_comgr_data_t handles that have been added to it. The exception is that +## the default device library data object handles can be non-disjoint as they +## are imutable. +## +## The library supports generating and inspecting code objects that +## contain machine code for a certain set of instruction set +## arhitectures (isa). The set of isa supported and information about +## the properties of the isa can be queried. +## +## The library supports performing an action that can take data +## objects of one kind, and generate new data objects of another kind. +## +## Data objects are referenced using handles using @p +## amd_comgr_data_t. The kinds of data objects are given +## by @p amd_comgr_data_kind_t. +## +## To perform an action, two @p amd_comgr_data_set_t +## objects are created. One is used to hold all the data objects +## needed by an action, and other is updated by the action with all +## the result data objects. In addition, an @p +## amd_comgr_action_info_t is created to hold +## information that controls the action. These are then passed to @p +## amd_comgr_do_action to perform an action specified by +## @p amd_comgr_action_kind_t. +## +## Some data objects can have associated metadata. There are +## operations for querying this metadata. +## +## The default device library that satisfies the requirements of the +## compiler action can be obtained. +## +## The library inspects some environment variables to aid in debugging. These +## include: +## - @p AMD_COMGR_SAVE_TEMPS: If this is set, and is not "0", the library does +## not delete temporary files generated while executing compilation actions. 
+## These files do not appear in the current working directory, but are +## instead left in a platform-specific temporary directory (/tmp on Linux and +## C:\Temp or the path found in the TEMP environment variable on Windows). +## - @p AMD_COMGR_REDIRECT_LOGS: If this is not set, or is set to "0", logs are +## returned to the caller as normal. If this is set to "stdout"/"-" or +## "stderr", logs are instead redirected to the standard output or error +## stream, respectively. If this is set to any other value, it is interpreted +## as a filename which logs should be appended to. Logs may be redirected +## irrespective of whether logging is enabled. +## - @p AMD_COMGR_EMIT_VERBOSE_LOGS: If this is set, and is not "0", logs will +## include additional Comgr-specific informational messages. +## +## +## @brief Status codes. + +type + ComgrStatus* {.size: sizeof(cint).} = enum + # From amd_comgr_status_t + AMD_COMGR_STATUS_SUCCESS = 0x0, ## The function has been executed successfully. + AMD_COMGR_STATUS_ERROR = 0x1, ## A generic error has occurred. + AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT = 0x2, + ## One of the actual arguments does not meet a precondition stated + ## in the documentation of the corresponding formal argument. This + ## includes both invalid Action types, and invalid arguments to + ## valid Action types. + AMD_COMGR_STATUS_ERROR_OUT_OF_RESOURCES = 0x3 ## Failed to allocate the necessary resources. + +type + ComgrDataKind* {.size: sizeof(cint).} = enum + # From amd_comgr_data_kind_t + AMD_COMGR_DATA_KIND_UNDEF = 0x0, ## No data is available. + AMD_COMGR_DATA_KIND_SOURCE = 0x1, ## The data is a textual main source. + AMD_COMGR_DATA_KIND_INCLUDE = 0x2, + ## The data is a textual source that is included in the main source + ## or other include source. + AMD_COMGR_DATA_KIND_PRECOMPILED_HEADER = 0x3, + ## The data is a precompiled-header source that is included in the main + ## source or other include source. 
+ AMD_COMGR_DATA_KIND_DIAGNOSTIC = 0x4, ## The data is a diagnostic output. + AMD_COMGR_DATA_KIND_LOG = 0x5, ## The data is a textual log output. + AMD_COMGR_DATA_KIND_BC = 0x6, ## The data is compiler LLVM IR bit code for a specific isa. + AMD_COMGR_DATA_KIND_RELOCATABLE = 0x7, ## The data is a relocatable machine code object for a specific isa. + AMD_COMGR_DATA_KIND_EXECUTABLE = 0x8, + ## The data is an executable machine code object for a specific + ## isa. An executable is the kind of code object that can be loaded + ## and executed. + AMD_COMGR_DATA_KIND_BYTES = 0x9, ## The data is a block of bytes. + AMD_COMGR_DATA_KIND_FATBIN = 0x10, ## The data is a fat binary (clang-offload-bundler output). + AMD_COMGR_DATA_KIND_AR = 0x11, ## The data is an archive. + AMD_COMGR_DATA_KIND_BC_BUNDLE = 0x12, ## The data is a bundled bitcode. + AMD_COMGR_DATA_KIND_AR_BUNDLE = 0x13 ## The data is a bundled archive. + + + +type + ComGrActionKind* {.size: sizeof(cint).} = enum + ## + ## @brief The kinds of actions that can be performed. + ## + # From amd_comgr_action_kind_t + AMD_COMGR_ACTION_SOURCE_TO_PREPROCESSOR = 0x0, + ## Preprocess each source data object in @p input in order. For each + ## successful preprocessor invocation, add a source data object to @p result. + ## Resolve any include source names using the names of include data objects + ## in @p input. Resolve any include relative path names using the working + ## directory path in @p info. Preprocess the source for the language in @p + ## info. + ## + ## Return @p AMD_COMGR_STATUS_ERROR if any preprocessing fails. + ## + ## Return @p AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT + ## if isa name or language is not set in @p info. + AMD_COMGR_ACTION_ADD_PRECOMPILED_HEADERS = 0x1, + ## Copy all existing data objects in @p input to @p output, then add the + ## device-specific and language-specific precompiled headers required for + ## compilation. 
+ ## + ## Currently the only supported languages are @p AMD_COMGR_LANGUAGE_OPENCL_1_2 + ## and @p AMD_COMGR_LANGUAGE_OPENCL_2_0. + ## + ## Return @p + ## AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT if isa name or language + ## is not set in @p info, or the language is not supported. + AMD_COMGR_ACTION_COMPILE_SOURCE_TO_BC = 0x2, + ## Compile each source data object in @p input in order. For each + ## successful compilation add a bc data object to @p result. Resolve + ## any include source names using the names of include data objects + ## in @p input. Resolve any include relative path names using the + ## working directory path in @p info. Produce bc for isa name in @p + ## info. Compile the source for the language in @p info. + ## + ## Return @p AMD_COMGR_STATUS_ERROR if any compilation + ## fails. + ## + ## Return @p + ## AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT + ## if isa name or language is not set in @p info. + AMD_COMGR_ACTION_ADD_DEVICE_LIBRARIES = 0x3, + ## Copy all existing data objects in @p input to @p output, then add the + ## device-specific and language-specific bitcode libraries required for + ## compilation. + ## + ## Currently the only supported languages are @p AMD_COMGR_LANGUAGE_OPENCL_1_2, + ## @p AMD_COMGR_LANGUAGE_OPENCL_2_0, and @p AMD_COMGR_LANGUAGE_HIP. + ## + ## The options in @p info should be set to a set of language-specific flags. 
+ ## For OpenCL and HIP these include: + ## + ## correctly_rounded_sqrt + ## daz_opt + ## finite_only + ## unsafe_math + ## wavefrontsize64 + ## + ## For example, to enable daz_opt and unsafe_math, the options should be set + ## as: + ## + ## const char *options[] = {"daz_opt, "unsafe_math"}; + ## size_t optionsCount = sizeof(options) / sizeof(options[0]); + ## + ## amd_comgr_action_info_set_option_list(info, options, optionsCount); + ## + ## Return @p + ## AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT if isa name or language + ## is not set in @p info, the language is not supported, an unknown + ## language-specific flag is supplied, or a language-specific flag is + ## repeated. + ## + ## @deprecated since 1.7 + ## @warning This action, followed by @c AMD_COMGR_ACTION_LINK_BC_TO_BC, may + ## result in subtle bugs due to incorrect linking of the device libraries. + ## The @c + ## AMD_COMGR_ACTION_COMPILE_SOURCE_WITH_DEVICE_LIBS_TO_BC action can + ## be used as a workaround which ensures the link occurs correctly. + AMD_COMGR_ACTION_LINK_BC_TO_BC = 0x4, ## + ## Link a collection of bitcodes, bundled bitcodes, and bundled bitcode + ## archives in @p into a single composite (unbundled) bitcode @p. + ## Any device library bc data object must be explicitly added to @p input if + ## needed. + ## + ## Return @p AMD_COMGR_STATUS_ERROR if the link or unbundling fails. + ## + ## Return @p + ## AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT + ## if isa name is not set in @p info and does not match the isa name + ## of all bc data objects in @p input. + AMD_COMGR_ACTION_OPTIMIZE_BC_TO_BC = 0x5, + ## Optimize each bc data object in @p input and create an optimized bc data + ## object to @p result. + ## + ## Return @p AMD_COMGR_STATUS_ERROR if the optimization fails. + ## + ## Return @p AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT + ## if isa name is not set in @p info and does not match the isa name + ## of all bc data objects in @p input. 
+ AMD_COMGR_ACTION_CODEGEN_BC_TO_RELOCATABLE = 0x6, + ## Perform code generation for each bc data object in @p input in + ## order. For each successful code generation add a relocatable data + ## object to @p result. + ## + ## Return @p AMD_COMGR_STATUS_ERROR if any code + ## generation fails. + ## + ## Return @p + ## AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT + ## if isa name is not set in @p info and does not match the isa name + ## of all bc data objects in @p input. + AMD_COMGR_ACTION_CODEGEN_BC_TO_ASSEMBLY = 0x7, + ## Perform code generation for each bc data object in @p input in + ## order. For each successful code generation add an assembly source data + ## object to @p result. + ## + ## Return @p AMD_COMGR_STATUS_ERROR if any code + ## generation fails. + ## + ## Return @p + ## AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT + ## if isa name is not set in @p info and does not match the isa name + ## of all bc data objects in @p input. + AMD_COMGR_ACTION_LINK_RELOCATABLE_TO_RELOCATABLE = 0x8, + ## Link each relocatable data object in @p input together and add + ## the linked relocatable data object to @p result. Any device + ## library relocatable data object must be explicitly added to @p + ## input if needed. + ## + ## Return @p AMD_COMGR_STATUS_ERROR if the link fails. + ## + ## Return @p + ## AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT + ## if isa name is not set in @p info and does not match the isa name + ## of all relocatable data objects in @p input. + AMD_COMGR_ACTION_LINK_RELOCATABLE_TO_EXECUTABLE = 0x9, + ## Link each relocatable data object in @p input together and add + ## the linked executable data object to @p result. Any device + ## library relocatable data object must be explicitly added to @p + ## input if needed. + ## + ## Return @p AMD_COMGR_STATUS_ERROR if the link fails. 
+ ## + ## Return @p + ## AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT + ## if isa name is not set in @p info and does not match the isa name + ## of all relocatable data objects in @p input. + AMD_COMGR_ACTION_ASSEMBLE_SOURCE_TO_RELOCATABLE = 0xA, + ## Assemble each source data object in @p input in order into machine code. + ## For each successful assembly add a relocatable data object to @p result. + ## Resolve any include source names using the names of include data objects in + ## @p input. Resolve any include relative path names using the working + ## directory path in @p info. Produce relocatable for isa name in @p info. + ## + ## Return @p AMD_COMGR_STATUS_ERROR if any assembly fails. + ## + ## Return @p + ## AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT if isa name is not set in + ## @p info. + AMD_COMGR_ACTION_DISASSEMBLE_RELOCATABLE_TO_SOURCE = 0xB, + ## Disassemble each relocatable data object in @p input in + ## order. For each successful disassembly add a source data object to + ## @p result. + ## + ## Return @p AMD_COMGR_STATUS_ERROR if any disassembly + ## fails. + ## + ## Return @p + ## AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT + ## if isa name is not set in @p info and does not match the isa name + ## of all relocatable data objects in @p input. + AMD_COMGR_ACTION_DISASSEMBLE_EXECUTABLE_TO_SOURCE = 0xC, + ## Disassemble each executable data object in @p input in order. For + ## each successful disassembly add a source data object to @p result. + ## + ## Return @p + ## AMD_COMGR_STATUS_ERROR if any disassembly + ## fails. + ## + ## Return @p + ## AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT + ## if isa name is not set in @p info and does not match the isa name + ## of all relocatable data objects in @p input. + AMD_COMGR_ACTION_DISASSEMBLE_BYTES_TO_SOURCE = 0xD, + ## Disassemble each bytes data object in @p input in order. For each + ## successful disassembly add a source data object to @p + ## result. 
Only simple assembly language commands are generate that + ## corresponf to raw bytes are supported, not any directives that + ## control the code object layout, or symbolic branch targets or + ## names. + ## + ## Return @p AMD_COMGR_STATUS_ERROR if any disassembly + ## fails. + ## + ## Return @p + ## AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT + ## if isa name is not set in @p info + AMD_COMGR_ACTION_COMPILE_SOURCE_TO_FATBIN = 0xE, + ## Compile each source data object in @p input in order. For each + ## successful compilation add a fat binary to @p result. Resolve + ## any include source names using the names of include data objects + ## in @p input. Resolve any include relative path names using the + ## working directory path in @p info. Produce fat binary for isa name in @p + ## info. Compile the source for the language in @p info. + ## + ## Return @p AMD_COMGR_STATUS_ERROR if any compilation + ## fails. + ## + ## Return @p + ## AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT + ## if isa name or language is not set in @p info. + ## + ## @deprecated since 2.5 + ## @see in-process compilation via + ## AMD_COMGR_ACTION_COMPILE_SOURCE_TO_BC, etc. + ## instead + + AMD_COMGR_ACTION_COMPILE_SOURCE_WITH_DEVICE_LIBS_TO_BC = 0xF + ## Compile each source data object in @p input in order. For each + ## successful compilation add a bc data object to @p result. Resolve + ## any include source names using the names of include data objects + ## in @p input. Resolve any include relative path names using the + ## working directory path in @p info. Produce bc for isa name in @p + ## info. Compile the source for the language in @p info. Link against + ## the device-specific and language-specific bitcode device libraries + ## required for compilation. + ## + ## Return @p AMD_COMGR_STATUS_ERROR if any compilation + ## fails. + ## + ## Return @p + ## AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT + ## if isa name or language is not set in @p info. 
+ +type + ComgrData* {.bycopy.} = object + ## @brief A handle to a data object. + ## + ## Data objects are used to hold the data which is either an input or + ## output of a code object manager action. + # From amd_comgr_data_t + handle*: uint64 + + ComgrActionInfo* {.bycopy.} = object + ## @brief A handle to an action information object. + ## + ## An action information object holds all the necessary information, + ## excluding the input data objects, required to perform an action. + # From amd_comgr_action_info_t + handle*: uint64 + +type + ComgrDataset* {.bycopy.} = object + ## @brief A handle to an action information object. + ## + ## An action information object holds all the necessary information, + ## excluding the input data objects, required to perform an action. + # From amd_comgr_data_set_t + handle*: uint64 + + +const libPath = "/opt/rocm/lib/" # For now, only support Linux +static: echo "[Constantine] Will search AMD Comgr in $LD_LIBRARY_PATH and " & libPath & "libamd_comgr.so" +const libAmdComgr = "(libamd_comgr.so|" & libPath & "libamd_comgr.so)" + +{.push noconv, importc, dynlib: libAmdComgr.} + +proc amd_comgr_create_data*(kind: ComgrDataKind; data: var ComgrData): ComgrStatus + ## @brief Create a data object that can hold data of a specified kind. + ## + ## Data objects are reference counted and are destroyed when the + ## reference count reaches 0. When a data object is created its + ## reference count is 1, it has 0 bytes of data, it has an empty name, + ## and it has no metadata. + ## + ## @param[in] kind The kind of data the object is intended to hold. + ## + ## @param[out] data A handle to the data object created. Its reference + ## count is set to 1. + ## + ## @retval ::AMD_COMGR_STATUS_SUCCESS The function has + ## been executed successfully. + ## + ## @retval ::AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT @p + ## kind is an invalid data kind, or @p + ## AMD_COMGR_DATA_KIND_UNDEF. @p data is NULL. 
+ ## + ## @retval ::AMD_COMGR_STATUS_ERROR_OUT_OF_RESOURCES + ## Unable to create the data object as out of resources. + +proc amd_comgr_release_data*(data: ComgrData): ComgrStatus + ## @brief Indicate that no longer using a data object handle. + ## + ## The reference count of the associated data object is + ## decremented. If it reaches 0 it is destroyed. + ## + ## @param[in] data The data object to release. + ## + ## @retval ::AMD_COMGR_STATUS_SUCCESS The function has + ## been executed successfully. + ## + ## @retval ::AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT @p + ## data is an invalid data object, or has kind @p + ## AMD_COMGR_DATA_KIND_UNDEF. + ## + ## @retval ::AMD_COMGR_STATUS_ERROR_OUT_OF_RESOURCES + ## Unable to update the data object as out of resources. + +proc amd_comgr_set_data*(data: ComgrData; size: csize_t; bytes: pointer): ComgrStatus + ## @brief Set the data content of a data object to the specified + ## bytes. + ## + ## Any previous value of the data object is overwritten. Any metadata + ## associated with the data object is also replaced which invalidates + ## all metadata handles to the old metadata. + ## + ## @param[in] data The data object to update. + ## + ## @param[in] size The number of bytes in the data specified by @p bytes. + ## + ## @param[in] bytes The bytes to set the data object to. The bytes are + ## copied into the data object and can be freed after the call. + ## + ## @retval ::AMD_COMGR_STATUS_SUCCESS The function has + ## been executed successfully. + ## + ## @retval ::AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT @p + ## data is an invalid data object, or has kind @p + ## AMD_COMGR_DATA_KIND_UNDEF. + ## + ## @retval ::AMD_COMGR_STATUS_ERROR_OUT_OF_RESOURCES + ## Unable to update the data object as out of resources. + +proc amd_comgr_set_data_name*(data: ComgrData; name: cstring): ComgrStatus + ## @brief Set the name associated with a data object. 
+ ##
+ ## When compiling, the full name of an include directive is used to
+ ## reference the contents of the include data object with the same
+ ## name. The name may also be used for other data objects in log and
+ ## diagnostic output.
+ ##
+ ## @param[in] data The data object to update.
+ ##
+ ## @param[in] name A null terminated string that specifies the name to
+ ## use for the data object. If NULL then the name is set to the empty
+ ## string.
+ ##
+ ## @retval ::AMD_COMGR_STATUS_SUCCESS The function has
+ ## been executed successfully.
+ ##
+ ## @retval ::AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT @p
+ ## data is an invalid data object, or has kind @p
+ ## AMD_COMGR_DATA_KIND_UNDEF.
+ ##
+ ## @retval ::AMD_COMGR_STATUS_ERROR_OUT_OF_RESOURCES
+ ## Unable to update the data object as out of resources.
+
+proc amd_comgr_get_data*(data: ComgrData; size: var csize_t; bytes: pointer): ComgrStatus
+ ## @brief Get the data object contents and/or contents size.
+ ##
+ ## @param[in] data The data object to query.
+ ##
+ ## @param[in, out] size On entry, the size of @p bytes. On return, the size of
+ ## the data object contents.
+ ##
+ ## @param[out] bytes If not NULL, then the first @p size bytes of the
+ ## data object contents are copied. If @p bytes is NULL, only @p size is updated
+ ## (useful in order to find the size of buffer required to copy the contents).
+ ##
+ ## @retval ::AMD_COMGR_STATUS_SUCCESS The function has
+ ## been executed successfully.
+ ##
+ ## @retval ::AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT @p
+ ## data is an invalid data object, or has kind @p
+ ## AMD_COMGR_DATA_KIND_UNDEF. @p size is NULL.
+ ##
+ ## @retval ::AMD_COMGR_STATUS_ERROR_OUT_OF_RESOURCES
+ ## Unable to update the data object as out of resources.
+
+proc amd_comgr_create_action_info*(action_info: var ComgrActionInfo): ComgrStatus
+ ## @brief Create an action info object.
+ ##
+ ## @param[out] action_info A handle to the action info object created.
+ ## + ## @retval ::AMD_COMGR_STATUS_SUCCESS The function has + ## been executed successfully. + ## + ## @retval ::AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT @p + ## action_info is NULL. + ## + ## @retval ::AMD_COMGR_STATUS_ERROR_OUT_OF_RESOURCES + ## Unable to create the action info object as out of resources. + +proc amd_comgr_destroy_action_info*(action_info: ComgrActionInfo): ComgrStatus + ## @brief Destroy an action info object. + ## + ## @param[in] action_info A handle to the action info object to destroy. + ## + ## @retval ::AMD_COMGR_STATUS_SUCCESS The function has + ## been executed successfully. + ## + ## @retval ::AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT @p + ## action_info is an invalid action info object. + ## + ## @retval ::AMD_COMGR_STATUS_ERROR_OUT_OF_RESOURCES + ## Unable to update action info object as out of resources. + +proc amd_comgr_action_info_set_isa_name*(action_info: ComgrActionInfo; + isa_name: cstring): ComgrStatus + + ## @brief Set the isa name of an action info object. + ## + ## When an action info object is created it has no isa name. Some + ## actions require that the action info object has an isa name + ## defined. + ## + ## @param[in] action_info A handle to the action info object to be + ## updated. + ## + ## @param[in] isa_name A null terminated string that is the isa name. If NULL + ## or the empty string then the isa name is cleared. The isa name is defined as + ## the Code Object Target Identification string, described at + ## https://llvm.org/docs/AMDGPUUsage.html#code-object-target-identification + ## + ## @retval ::AMD_COMGR_STATUS_SUCCESS The function has + ## been executed successfully. + ## + ## @retval ::AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT @p + ## action_info is an invalid action info object. @p isa_name is not an + ## isa name supported by this version of the code object manager + ## library. + ## + ## @retval ::AMD_COMGR_STATUS_ERROR_OUT_OF_RESOURCES + ## Unable to update action info object as out of resources. 
+ ## + ## ----- + ## Example ISA: "amdgcn-amd-amdhsa--gfx900" + +proc amd_comgr_create_data_set*(data_set: var ComgrDataset): ComgrStatus + ## @brief Create a data set object. + ## + ## @param[out] data_set A handle to the data set created. Initially it + ## contains no data objects. + ## + ## @retval ::AMD_COMGR_STATUS_SUCCESS The function has been executed + ## successfully. + ## + ## @retval ::AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT @p data_set is NULL. + ## + ## @retval ::AMD_COMGR_STATUS_ERROR_OUT_OF_RESOURCES Unable to create the data + ## set object as out of resources. + +proc amd_comgr_destroy_data_set*(data_set: ComgrDataset): ComgrStatus + ## @brief Destroy a data set object. + ## + ## The reference counts of any associated data objects are decremented. Any + ## handles to the data set object become invalid. + ## + ## @param[in] data_set A handle to the data set object to destroy. + ## + ## @retval ::AMD_COMGR_STATUS_SUCCESS The function has been executed + ## successfully. + ## + ## @retval ::AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT @p data_set is an invalid + ## data set object. + ## + ## @retval ::AMD_COMGR_STATUS_ERROR_OUT_OF_RESOURCES Unable to update data set + ## object as out of resources. + +proc amd_comgr_data_set_add*(data_set: ComgrDataset; data: ComgrData): ComgrStatus + ## @brief Add a data object to a data set object if it is not already added. + ## + ## The reference count of the data object is incremented. + ## + ## @param[in] data_set A handle to the data set object to be updated. + ## + ## @param[in] data A handle to the data object to be added. If @p data_set + ## already has the specified handle present, then it is not added. The order + ## that data objects are added is preserved. + ## + ## @retval ::AMD_COMGR_STATUS_SUCCESS The function has been executed + ## successfully. + ## + ## @retval ::AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT @p data_set is an invalid + ## data set object. 
@p data is an invalid data object; has undef kind; has + ## include kind but does not have a name. + ## + ## @retval ::AMD_COMGR_STATUS_ERROR_OUT_OF_RESOURCES Unable to update data set + ## object as out of resources. + +proc amd_comgr_do_action*(kind: ComGrActionKind; + info: ComgrActionInfo; + input: ComgrDataset; output: ComgrDataset): ComgrStatus + + ## @brief Perform an action. + ## + ## Each action ignores any data objects in @p input that it does not + ## use. If logging is enabled in @info then @p result will have a log + ## data object added. Any diagnostic data objects produced by the + ## action will be added to @p result. See the description of each + ## action in @p amd_comgr_action_kind_t. + ## + ## @param[in] kind The action to perform. + ## + ## @param[in] info The action info to use when performing the action. + ## + ## @param[in] input The input data objects to the @p kind action. + ## + ## @param[out] result Any data objects are removed before performing + ## the action which then adds all data objects produced by the action. + ## + ## @retval ::AMD_COMGR_STATUS_SUCCESS The function has + ## been executed successfully. + ## + ## @retval ::AMD_COMGR_STATUS_ERROR An error was + ## reported when executing the action. + ## + ## @retval ::AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT @p + ## kind is an invalid action kind. @p input_data or @p result_data are + ## invalid action data object handles. See the description of each + ## action in @p amd_comgr_action_kind_t for other + ## conditions that result in this status. + ## + ## @retval ::AMD_COMGR_STATUS_ERROR_OUT_OF_RESOURCES + ## Unable to update the data object as out of resources. + +proc amd_comgr_action_data_get_data*(data_set: ComgrDataset; + data_kind: ComgrDataKind; + index: csize_t; data: var ComgrData): ComgrStatus + ## @brief Return the Nth data object of a specified data kind that is added to a + ## data set object. + ## + ## The reference count of the returned data object is incremented. 
+ ## + ## @param[in] data_set A handle to the data set object to be queried. + ## + ## @param[in] data_kind The data kind of the data object to be returned. + ## + ## @param[in] index The index of the data object of data kind @data_kind to be + ## returned. The first data object is index 0. The order of data objects matches + ## the order that they were added to the data set object. + ## + ## @param[out] data The data object being requested. + ## + ## @retval ::AMD_COMGR_STATUS_SUCCESS The function has been executed + ## successfully. + ## + ## @retval ::AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT @p data_set is an invalid + ## data set object. @p data_kind is an invalid data kind or @p + ## AMD_COMGR_DATA_KIND_UNDEF. @p index is greater than the number of data + ## objects of kind @p data_kind. @p data is NULL. + ## + ## @retval ::AMD_COMGR_STATUS_ERROR_OUT_OF_RESOURCES Unable to query data set + ## object as out of resources. + +{.pop.} # noconv, importc, dynlib: libAmdComgr diff --git a/constantine/platforms/abis/amdgpu_abi.nim b/constantine/platforms/abis/amdgpu_abi.nim new file mode 100644 index 000000000..d2e6d6629 --- /dev/null +++ b/constantine/platforms/abis/amdgpu_abi.nim @@ -0,0 +1,658 @@ +# Constantine +# Copyright (c) 2018-2019 Status Research & Development GmbH +# Copyright (c) 2020-Present Mamy André-Ratsimbazafy +# Licensed and distributed under either of +# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT). +# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). +# at your option. This file may not be copied, modified, or distributed except according to those terms. 
+ +# ############################################################ +# +# Bindings to AMD GPUs libraries +# +# ############################################################ + +import ./c_abi + +# ############################################################ +# +# HIP +# +# ############################################################ + +# Cuda Driver API -> Hip porting guide +# - https://rocm.docs.amd.com/projects/HIP/en/docs-5.7.1/user_guide/hip_porting_driver_api.html +# - https://rocm.docs.amd.com/projects/HIPIFY/en/latest/tables/CUDA_Driver_API_functions_supported_by_HIP.html +# +# c2nim on /opt/rocm/include/hip/hip_runtime_api.h +# or just copy-pasting can be used. + +const libPath = "/opt/rocm/lib/" # For now, only support Linux +static: echo "[Constantine] Will search AMD HIP runtime in $LD_LIBRARY_PATH and " & libPath & "libamdhip64.so" +const libAmdHip = "(libamdhip64.so|" & libPath & "libamdhip64.so)" + +type + HipError* {.size: sizeof(cint).} = enum + ## hipError_t + hipSuccess = 0 ## Successful completion. + hipErrorInvalidValue = 1 ## One or more of the parameters passed to the API call is NULL + ## or not in an acceptable range. + # hipErrorOutOfMemory = 2 ## out of memory range. + ## Deprecated + hipErrorMemoryAllocation = 2 ## Memory allocation error. 
+ # hipErrorNotInitialized = 3 ## Invalid not initialized
+ ## Deprecated
+ hipErrorInitializationError = 3
+ hipErrorDeinitialized = 4 ## Deinitialized
+ hipErrorProfilerDisabled = 5
+ hipErrorProfilerNotInitialized = 6
+ hipErrorProfilerAlreadyStarted = 7
+ hipErrorProfilerAlreadyStopped = 8
+ hipErrorInvalidConfiguration = 9 ## Invalid configuration
+ hipErrorInvalidPitchValue = 12 ## Invalid pitch value
+ hipErrorInvalidSymbol = 13 ## Invalid symbol
+ hipErrorInvalidDevicePointer = 17 ## Invalid Device Pointer
+ hipErrorInvalidMemcpyDirection = 21 ## Invalid memory copy direction
+ hipErrorInsufficientDriver = 35
+ hipErrorMissingConfiguration = 52
+ hipErrorPriorLaunchFailure = 53
+ hipErrorInvalidDeviceFunction = 98 ## Invalid device function
+ hipErrorNoDevice = 100 ## Call to hipGetDeviceCount returned 0 devices
+ hipErrorInvalidDevice = 101 ## DeviceID must be in range from 0 to compute-devices.
+ hipErrorInvalidImage = 200 ## Invalid image
+ hipErrorInvalidContext = 201 ## Produced when input context is invalid.
+ hipErrorContextAlreadyCurrent = 202
+ # hipErrorMapFailed = 205
+ ## Deprecated
+ hipErrorMapBufferObjectFailed = 205 ## Produced when the IPC memory attach failed from ROCr.
+ hipErrorUnmapFailed = 206
+ hipErrorArrayIsMapped = 207
+ hipErrorAlreadyMapped = 208
+ hipErrorNoBinaryForGpu = 209
+ hipErrorAlreadyAcquired = 210
+ hipErrorNotMapped = 211
+ hipErrorNotMappedAsArray = 212
+ hipErrorNotMappedAsPointer = 213
+ hipErrorECCNotCorrectable = 214
+ hipErrorUnsupportedLimit = 215 ## Unsupported limit
+ hipErrorContextAlreadyInUse = 216 ## The context is already in use
+ hipErrorPeerAccessUnsupported = 217
+ hipErrorInvalidKernelFile = 218 ## In CUDA DRV it is CUDA_ERROR_INVALID_PTX
+ hipErrorInvalidGraphicsContext = 219
+ hipErrorInvalidSource = 300 ## Invalid source.
+ hipErrorFileNotFound = 301 ## The file is not found.
+ hipErrorSharedObjectSymbolNotFound = 302
+ hipErrorSharedObjectInitFailed = 303 ## Failed to initialize shared object.
+ hipErrorOperatingSystem = 304 ## Not the correct operating system + # hipErrorInvalidHandle = 400 ## Invalide handle + ## Deprecated + hipErrorInvalidResourceHandle = 400 ## Resource handle (hipEvent_t or hipStream_t) invalid. + hipErrorIllegalState = 401 ## Resource required is not in a valid state to perform operation. + hipErrorNotFound = 500 ## Not found + hipErrorNotReady = 600 ## Indicates that asynchronous operations enqueued earlier are not + ## ready. This is not actually an error but is used to distinguish + ## from hipSuccess (which indicates completion). APIs that return + ## this error include hipEventQuery and hipStreamQuery. + hipErrorIllegalAddress = 700 + hipErrorLaunchOutOfResources = 701 ## Out of resources error. + hipErrorLaunchTimeOut = 702 ## Timeout for the launch. + hipErrorPeerAccessAlreadyEnabled = 704 ## Peer access was already enabled from the current device. + hipErrorPeerAccessNotEnabled = 705 ## Peer access was never enabled from the current device. + hipErrorSetOnActiveProcess = 708 ## The process is active. + hipErrorContextIsDestroyed = 709 ## The context is already destroyed + hipErrorAssert = 710 ## Produced when the kernel calls assert. + hipErrorHostMemoryAlreadyRegistered = 712 ## Produced when trying to lock a page-locked memory. + hipErrorHostMemoryNotRegistered = 713 ## Produced when trying to unlock a non-page-locked memory. + hipErrorLaunchFailure = 719 ## An exception occurred on the device while executing a kernel. + hipErrorCooperativeLaunchTooLarge = + 720 ## This error indicates that the number of blocks launched per grid for a kernel + ## that was launched via cooperative launch APIs exceeds the maximum number of + ## allowed blocks for the current device + hipErrorNotSupported = 801 ## Produced when the hip API is not supported/implemented + hipErrorStreamCaptureUnsupported = 900 ## The operation is not permitted when the stream + ## is capturing. 
+ hipErrorStreamCaptureInvalidated = 901 ## The current capture sequence on the stream
+ ## has been invalidated due to a previous error.
+ hipErrorStreamCaptureMerge = 902 ## The operation would have resulted in a merge of
+ ## two independent capture sequences.
+ hipErrorStreamCaptureUnmatched = 903 ## The capture was not initiated in this stream.
+ hipErrorStreamCaptureUnjoined = 904 ## The capture sequence contains a fork that was not
+ ## joined to the primary stream.
+ hipErrorStreamCaptureIsolation = 905 ## A dependency would have been created which crosses
+ ## the capture sequence boundary. Only implicit
+ ## in-stream ordering dependencies are allowed
+ ## to cross the boundary.
+ hipErrorStreamCaptureImplicit = 906 ## The operation would have resulted in a disallowed
+ ## implicit dependency on a current capture sequence
+ ## from hipStreamLegacy.
+ hipErrorCapturedEvent = 907 ## The operation is not permitted on an event which was last
+ ## recorded in a capturing stream.
+ hipErrorStreamCaptureWrongThread = 908 ## A stream capture sequence not initiated with
+ ## the hipStreamCaptureModeRelaxed argument to
+ ## hipStreamBeginCapture was passed to
+ ## hipStreamEndCapture in a different thread.
+ hipErrorGraphExecUpdateFailure = 910 ## This error indicates that the graph update
+ ## was not performed because it included changes which
+ ## violated constraints specific to instantiated graph
+ ## update.
+ hipErrorUnknown = 999 ## Unknown error.
+
+ ## HSA Runtime Error Codes start here.
+ hipErrorRuntimeMemory = 1052 ## HSA runtime memory call returned error. Typically not seen
+ ## in production systems.
+ hipErrorRuntimeOther = 1053 ## HSA runtime call other than memory returned error. Typically
+ ## not seen in production systems.
+ hipErrorTbd ## Marker that more error codes are needed.
+ + + HipDeviceAttribute* {.size: sizeof(cint).} = enum + ## hipDeviceAttribute_t + + # hipDeviceAttributeCudaCompatibleBegin = 0 + + hipDeviceAttributeCudaCompatibleBegin = 0 ## Whether ECC support is enabled. + hipDeviceAttributeAccessPolicyMaxWindowSize ## Cuda only. The maximum size of the window policy in bytes. + hipDeviceAttributeAsyncEngineCount ## Asynchronous engines number. + hipDeviceAttributeCanMapHostMemory ## Whether host memory can be mapped into device address space + hipDeviceAttributeCanUseHostPointerForRegisteredMem ## Device can access host registered memory + ## at the same virtual address as the CPU + hipDeviceAttributeClockRate ## Peak clock frequency in kilohertz. + hipDeviceAttributeComputeMode ## Compute mode that device is currently in. + hipDeviceAttributeComputePreemptionSupported ## Device supports Compute Preemption. + hipDeviceAttributeConcurrentKernels ## Device can possibly execute multiple kernels concurrently. + hipDeviceAttributeConcurrentManagedAccess ## Device can coherently access managed memory concurrently with the CPU + hipDeviceAttributeCooperativeLaunch ## Support cooperative launch + hipDeviceAttributeCooperativeMultiDeviceLaunch ## Support cooperative launch on multiple devices + hipDeviceAttributeDeviceOverlap ## Device can concurrently copy memory and execute a kernel. + ## Deprecated. Use instead asyncEngineCount. + hipDeviceAttributeDirectManagedMemAccessFromHost ## Host can directly access managed memory on + ## the device without migration + hipDeviceAttributeGlobalL1CacheSupported ## Device supports caching globals in L1 + hipDeviceAttributeHostNativeAtomicSupported ## Link between the device and the host supports native atomic operations + hipDeviceAttributeIntegrated ## Device is integrated GPU + hipDeviceAttributeIsMultiGpuBoard ## Multiple GPU devices. + hipDeviceAttributeKernelExecTimeout ## Run time limit for kernels executed on the device + hipDeviceAttributeL2CacheSize ## Size of L2 cache in bytes. 
0 if the device doesn't have L2 cache. + hipDeviceAttributeLocalL1CacheSupported ## caching locals in L1 is supported + hipDeviceAttributeLuid ## 8-byte locally unique identifier in 8 bytes. Undefined on TCC and non-Windows platforms + hipDeviceAttributeLuidDeviceNodeMask ## Luid device node mask. Undefined on TCC and non-Windows platforms + hipDeviceAttributeComputeCapabilityMajor ## Major compute capability version number. + hipDeviceAttributeManagedMemory ## Device supports allocating managed memory on this system + hipDeviceAttributeMaxBlocksPerMultiProcessor ## Max block size per multiprocessor + hipDeviceAttributeMaxBlockDimX ## Max block size in width. + hipDeviceAttributeMaxBlockDimY ## Max block size in height. + hipDeviceAttributeMaxBlockDimZ ## Max block size in depth. + hipDeviceAttributeMaxGridDimX ## Max grid size in width. + hipDeviceAttributeMaxGridDimY ## Max grid size in height. + hipDeviceAttributeMaxGridDimZ ## Max grid size in depth. + hipDeviceAttributeMaxSurface1D ## Maximum size of 1D surface. + hipDeviceAttributeMaxSurface1DLayered ## Cuda only. Maximum dimensions of 1D layered surface. + hipDeviceAttributeMaxSurface2D ## Maximum dimension (width height) of 2D surface. + hipDeviceAttributeMaxSurface2DLayered ## Cuda only. Maximum dimensions of 2D layered surface. + hipDeviceAttributeMaxSurface3D ## Maximum dimension (width height depth) of 3D surface. + hipDeviceAttributeMaxSurfaceCubemap ## Cuda only. Maximum dimensions of Cubemap surface. + hipDeviceAttributeMaxSurfaceCubemapLayered ## Cuda only. Maximum dimension of Cubemap layered surface. + hipDeviceAttributeMaxTexture1DWidth ## Maximum size of 1D texture. + hipDeviceAttributeMaxTexture1DLayered ## Maximum dimensions of 1D layered texture. + hipDeviceAttributeMaxTexture1DLinear ## Maximum number of elements allocatable in a 1D linear texture. + ## Use cudaDeviceGetTexture1DLinearMaxWidth() instead on Cuda. + hipDeviceAttributeMaxTexture1DMipmap ## Maximum size of 1D mipmapped texture. 
+ hipDeviceAttributeMaxTexture2DWidth ## Maximum dimension width of 2D texture.
+ hipDeviceAttributeMaxTexture2DHeight ## Maximum dimension height of 2D texture.
+ hipDeviceAttributeMaxTexture2DGather ## Maximum dimensions of 2D texture if gather operations performed.
+ hipDeviceAttributeMaxTexture2DLayered ## Maximum dimensions of 2D layered texture.
+ hipDeviceAttributeMaxTexture2DLinear ## Maximum dimensions (width height pitch) of 2D textures bound to pitched memory.
+ hipDeviceAttributeMaxTexture2DMipmap ## Maximum dimensions of 2D mipmapped texture.
+ hipDeviceAttributeMaxTexture3DWidth ## Maximum dimension width of 3D texture.
+ hipDeviceAttributeMaxTexture3DHeight ## Maximum dimension height of 3D texture.
+ hipDeviceAttributeMaxTexture3DDepth ## Maximum dimension depth of 3D texture.
+ hipDeviceAttributeMaxTexture3DAlt ## Maximum dimensions of alternate 3D texture.
+ hipDeviceAttributeMaxTextureCubemap ## Maximum dimensions of Cubemap texture
+ hipDeviceAttributeMaxTextureCubemapLayered ## Maximum dimensions of Cubemap layered texture.
+ hipDeviceAttributeMaxThreadsDim ## Maximum dimension of a block
+ hipDeviceAttributeMaxThreadsPerBlock ## Maximum number of threads per block.
+ hipDeviceAttributeMaxThreadsPerMultiProcessor ## Maximum resident threads per multiprocessor.
+ hipDeviceAttributeMaxPitch ## Maximum pitch in bytes allowed by memory copies
+ hipDeviceAttributeMemoryBusWidth ## Global memory bus width in bits.
+ hipDeviceAttributeMemoryClockRate ## Peak memory clock frequency in kilohertz.
+ hipDeviceAttributeComputeCapabilityMinor ## Minor compute capability version number.
+ hipDeviceAttributeMultiGpuBoardGroupID ## Unique ID of device group on the same multi-GPU board
+ hipDeviceAttributeMultiprocessorCount ## Number of multiprocessors on the device.
+ hipDeviceAttributeUnused1 ## Previously hipDeviceAttributeName
+ hipDeviceAttributePageableMemoryAccess ## Device supports coherently accessing pageable memory
+ ## without calling hipHostRegister on it
+ hipDeviceAttributePageableMemoryAccessUsesHostPageTables ## Device accesses pageable memory via the host's page tables
+ hipDeviceAttributePciBusId ## PCI Bus ID.
+ hipDeviceAttributePciDeviceId ## PCI Device ID.
+ hipDeviceAttributePciDomainID ## PCI Domain ID.
+ hipDeviceAttributePersistingL2CacheMaxSize ## Maximum l2 persisting lines capacity in bytes
+ hipDeviceAttributeMaxRegistersPerBlock ## 32-bit registers available to a thread block. This number is shared
+ ## by all thread blocks simultaneously resident on a multiprocessor.
+ hipDeviceAttributeMaxRegistersPerMultiprocessor ## 32-bit registers available per block.
+ hipDeviceAttributeReservedSharedMemPerBlock ## Shared memory reserved by CUDA driver per block.
+ hipDeviceAttributeMaxSharedMemoryPerBlock ## Maximum shared memory available per block in bytes.
+ hipDeviceAttributeSharedMemPerBlockOptin ## Maximum shared memory per block usable by special opt in.
+ hipDeviceAttributeSharedMemPerMultiprocessor ## Shared memory available per multiprocessor.
+ hipDeviceAttributeSingleToDoublePrecisionPerfRatio ## Cuda only. Performance ratio of single precision to double precision.
+ hipDeviceAttributeStreamPrioritiesSupported ## Whether to support stream priorities.
+ hipDeviceAttributeSurfaceAlignment ## Alignment requirement for surfaces
+ hipDeviceAttributeTccDriver ## Cuda only. Whether device is a Tesla device using TCC driver
+ hipDeviceAttributeTextureAlignment ## Alignment requirement for textures
+ hipDeviceAttributeTexturePitchAlignment ## Pitch alignment requirement for 2D texture references bound to pitched memory;
+ hipDeviceAttributeTotalConstantMemory ## Constant memory size in bytes.
+ hipDeviceAttributeTotalGlobalMem ## Global memory available on device.
+ hipDeviceAttributeUnifiedAddressing ## Cuda only. An unified address space shared with the host. + hipDeviceAttributeUnused2 ## Previously hipDeviceAttributeUuid + hipDeviceAttributeWarpSize ## Warp size in threads. + hipDeviceAttributeMemoryPoolsSupported ## Device supports HIP Stream Ordered Memory Allocator + hipDeviceAttributeVirtualMemoryManagementSupported ## Device supports HIP virtual memory management + hipDeviceAttributeHostRegisterSupported ## Can device support host memory registration via hipHostRegister + hipDeviceAttributeCudaCompatibleEnd = 9999 + + # hipDeviceAttributeAmdSpecificBegin = 10000 + # ---------------------------------------------------------------------------- + + hipDeviceAttributeClockInstructionRate = 10000 ## Frequency in khz of the timer used by the device-side "clock*" + hipDeviceAttributeUnused3 ## Previously hipDeviceAttributeArch + hipDeviceAttributeMaxSharedMemoryPerMultiprocessor ## Maximum Shared Memory PerMultiprocessor. + hipDeviceAttributeUnused4 ## Previously hipDeviceAttributeGcnArch + hipDeviceAttributeUnused5 ## Previously hipDeviceAttributeGcnArchName + hipDeviceAttributeHdpMemFlushCntl ## Address of the HDP_MEM_COHERENCY_FLUSH_CNTL register + hipDeviceAttributeHdpRegFlushCntl ## Address of the HDP_REG_COHERENCY_FLUSH_CNTL register + hipDeviceAttributeCooperativeMultiDeviceUnmatchedFunc ## Supports cooperative launch on multiple + ## devices with unmatched functions + hipDeviceAttributeCooperativeMultiDeviceUnmatchedGridDim ## Supports cooperative launch on multiple + ## devices with unmatched grid dimensions + hipDeviceAttributeCooperativeMultiDeviceUnmatchedBlockDim ## Supports cooperative launch on multiple + ## devices with unmatched block dimensions + hipDeviceAttributeCooperativeMultiDeviceUnmatchedSharedMem ## Supports cooperative launch on multiple + ## devices with unmatched shared memories + hipDeviceAttributeIsLargeBar ## Whether it is LargeBar + hipDeviceAttributeAsicRevision ## Revision of the GPU in 
this device + hipDeviceAttributeCanUseStreamWaitValue ## '1' if Device supports hipStreamWaitValue32() and + ## hipStreamWaitValue64(), '0' otherwise. + hipDeviceAttributeImageSupport ## '1' if Device supports image, '0' otherwise. + hipDeviceAttributePhysicalMultiProcessorCount ## All available physical compute + ## units for the device + hipDeviceAttributeFineGrainSupport ## '1' if Device supports fine grain, '0' otherwise + hipDeviceAttributeWallClockRate ## Constant frequency of wall clock in kilohertz. + + hipDeviceAttributeAmdSpecificEnd = 19999 + hipDeviceAttributeVendorSpecificBegin = 20000 + ## Extended attributes for vendors + + HipMemAttach_flags* = enum + hipMemAttachGlobal = 0x1, ## Memory can be accessed by any stream on any device + hipMemAttachHost = 0x2, ## Memory cannot be accessed by any stream on any device + hipMemAttachSingle = 0x4 + + HipDevice* = distinct int32 ## Hip Compute Device handle + HipContext* = distinct pointer # hipCtx_t + HipModule* = distinct pointer # hipModule_t + HipFunction* = distinct pointer # hipFunction_t + HipStream* = distinct pointer # hipStream_t + HipDeviceptr* = distinct pointer # HipDeviceptr_t + ## A pointer to data on the Hip device + +type + HipDeviceProp* {.bycopy.} = object + # Generated via c2nim from HIP v6.0.2 + # The ABI seems forward compatible with reserved bytes + # + # We don't import the `hip_runtime_api.h` header + # for one less dependency during deployment, + # especially given than some distributions like Ubuntu + # split between a dev (with headers) and regular package + # and Windows path management is cumbersome. + + # Note the macro + # #define hipDeviceProp_t hipDeviceProp_tR0600 + + name*: array[256, char] + ## Device name. + uuid*: hipUUID + ## UUID of a device + luid*: array[8, byte] + ## 8-byte unique identifier. Only valid on windows + luidDeviceNodeMask*: cuint + ## LUID node mask + totalGlobalMem*: csize_t + ## Size of global memory region (in bytes). 
+ sharedMemPerBlock*: csize_t + ## Size of shared memory region (in bytes). + regsPerBlock*: cint + ## Registers per block. + warpSize*: cint + ## Warp size. + memPitch*: csize_t + ## Maximum pitch in bytes allowed by memory copies + ## pitched memory + maxThreadsPerBlock*: cint + ## Max work items per work group or workgroup max size. + maxThreadsDim*: array[3, cint] + ## Max number of threads in each dimension (XYZ) of a block. + maxGridSize*: array[3, cint] + ## Max grid dimensions (XYZ). + clockRate*: cint + ## Max clock frequency of the multiProcessors in khz. + totalConstMem*: csize_t + ## Size of shared memory region (in bytes). + major*: cint + ## Major compute capability. On HCC, this is an approximation and features may + ## differ from CUDA CC. See the arch feature flags for portable ways to query + ## feature caps. + minor*: cint + ## Minor compute capability. On HCC, this is an approximation and features may + ## differ from CUDA CC. See the arch feature flags for portable ways to query + ## feature caps. + textureAlignment*: csize_t + ## Alignment requirement for textures + texturePitchAlignment*: csize_t + ## Pitch alignment requirement for texture references bound to + deviceOverlap*: cint + ## Deprecated. Use asyncEngineCount instead + multiProcessorCount*: cint + ## Number of multi-processors (compute units). + kernelExecTimeoutEnabled*: cint + ## Run time limit for kernels executed on the device + integrated*: cint + ## APU vs dGPU + canMapHostMemory*: cint + ## Check whether HIP can map host memory + computeMode*: cint + ## Compute mode. 
+ maxTexture1D*: cint + ## Maximum number of elements in 1D images + maxTexture1DMipmap*: cint + ## Maximum 1D mipmap texture size + maxTexture1DLinear*: cint + ## Maximum size for 1D textures bound to linear memory + maxTexture2D*: array[2, cint] + ## Maximum dimensions (width, height) of 2D images, in image elements + maxTexture2DMipmap*: array[2, cint] + ## Maximum number of elements in 2D array mipmap of images + maxTexture2DLinear*: array[3, cint] + ## Maximum 2D tex dimensions if tex are bound to pitched memory + maxTexture2DGather*: array[2, cint] + ## Maximum 2D tex dimensions if gather has to be performed + maxTexture3D*: array[3, cint] + ## Maximum dimensions (width, height, depth) of 3D images, in image + ## elements + maxTexture3DAlt*: array[3, cint] + ## Maximum alternate 3D texture dims + maxTextureCubemap*: cint + ## Maximum cubemap texture dims + maxTexture1DLayered*: array[2, cint] + ## Maximum number of elements in 1D array images + maxTexture2DLayered*: array[3, cint] + ## Maximum number of elements in 2D array images + maxTextureCubemapLayered*: array[2, cint] + ## Maximum cubemaps layered texture dims + maxSurface1D*: cint + ## Maximum 1D surface size + maxSurface2D*: array[2, cint] + ## Maximum 2D surface size + maxSurface3D*: array[3, cint] + ## Maximum 3D surface size + maxSurface1DLayered*: array[2, cint] + ## Maximum 1D layered surface size + maxSurface2DLayered*: array[3, cint] + ## Maximum 2D layared surface size + maxSurfaceCubemap*: cint + ## Maximum cubemap surface size + maxSurfaceCubemapLayered*: array[2, cint] + ## Maximum cubemap layered surface size + surfaceAlignment*: csize_t + ## Alignment requirement for surface + concurrentKernels*: cint + ## Device can possibly execute multiple kernels concurrently. + ECCEnabled*: cint + ## Device has ECC support enabled + pciBusID*: cint + ## PCI Bus ID. + pciDeviceID*: cint + ## PCI Device ID. 
+ pciDomainID*: cint + ## PCI Domain ID + tccDriver*: cint + ## 1:If device is Tesla device using TCC driver, else 0 + asyncEngineCount*: cint + ## Number of async engines + unifiedAddressing*: cint + ## Does device and host share unified address space + memoryClockRate*: cint + ## Max global memory clock frequency in khz. + memoryBusWidth*: cint + ## Global memory bus width in bits. + l2CacheSize*: cint + ## L2 cache size. + persistingL2CacheMaxSize*: cint + ## Device's max L2 persisting lines in bytes + maxThreadsPerMultiProcessor*: cint + ## Maximum resident threads per multi-processor. + streamPrioritiesSupported*: cint + ## Device supports stream priority + globalL1CacheSupported*: cint + ## Indicates globals are cached in L1 + localL1CacheSupported*: cint + ## Locals are cahced in L1 + sharedMemPerMultiprocessor*: csize_t + ## Amount of shared memory available per multiprocessor. + regsPerMultiprocessor*: cint + ## registers available per multiprocessor + managedMemory*: cint + ## Device supports allocating managed memory on this system + isMultiGpuBoard*: cint + ## 1 if device is on a multi-GPU board, 0 if not. + multiGpuBoardGroupID*: cint + ## Unique identifier for a group of devices on same multiboard GPU + hostNativeAtomicSupported*: cint + ## Link between host and device supports native atomics + singleToDoublePrecisionPerfRatio*: cint + ## Deprecated. CUDA only. 
+ pageableMemoryAccess*: cint + ## Device supports coherently accessing pageable memory + ## without calling hipHostRegister on it + concurrentManagedAccess*: cint + ## Device can coherently access managed memory concurrently with + ## the CPU + computePreemptionSupported*: cint + ## Is compute preemption supported on the device + canUseHostPointerForRegisteredMem*: cint + ## Device can access host registered memory with same + ## address as the host + cooperativeLaunch*: cint + ## HIP device supports cooperative launch + cooperativeMultiDeviceLaunch*: cint + ## HIP device supports cooperative launch on multiple + ## devices + sharedMemPerBlockOptin*: csize_t + ## Per device m ax shared mem per block usable by special opt in + pageableMemoryAccessUsesHostPageTables*: cint + ## Device accesses pageable memory via the host's + ## page tables + directManagedMemAccessFromHost*: cint + ## Host can directly access managed memory on the device + ## without migration + maxBlocksPerMultiProcessor*: cint + ## Max number of blocks on CU + accessPolicyMaxWindowSize*: cint + ## Max value of access policy window + reservedSharedMemPerBlock*: csize_t + ## Shared memory reserved by driver per block + hostRegisterSupported*: cint + ## Device supports hipHostRegister + sparseHipArraySupported*: cint + ## Indicates if device supports sparse hip arrays + hostRegisterReadOnlySupported*: cint + ## Device supports using the hipHostRegisterReadOnly flag + ## with hipHostRegistger + timelineSemaphoreInteropSupported*: cint + ## Indicates external timeline semaphore support + memoryPoolsSupported*: cint + ## Indicates if device supports hipMallocAsync and hipMemPool APIs + gpuDirectRDMASupported*: cint + ## Indicates device support of RDMA APIs + gpuDirectRDMAFlushWritesOptions*: cuint + ## Bitmask to be interpreted according to + ## hipFlushGPUDirectRDMAWritesOptions + gpuDirectRDMAWritesOrdering*: cint + ## value of hipGPUDirectRDMAWritesOrdering + memoryPoolSupportedHandleTypes*: cuint + 
## Bitmask of handle types support with mempool based IPC + deferredMappingHipArraySupported*: cint + ## Device supports deferred mapping HIP arrays and HIP + ## mipmapped arrays + ipcEventSupported*: cint + ## Device supports IPC events + clusterLaunch*: cint + ## Device supports cluster launch + unifiedFunctionPointers*: cint + ## Indicates device supports unified function pointers + reserved*: array[63, cint] + ## CUDA Reserved. + hipReserved*: array[32, cint] + ## Reserved for adding new entries for HIP/CUDA. + ## HIP Only struct members + gcnArchName*: array[256, char] + ## AMD GCN Arch Name. HIP Only. + maxSharedMemoryPerMultiProcessor*: csize_t + ## Maximum Shared Memory Per CU. HIP Only. + clockInstructionRate*: cint + ## Frequency in khz of the timer used by the device-side "clock*" + ## instructions. New for HIP. + arch*: HipDeviceArch + ## Architectural feature flags. New for HIP. + hdpMemFlushCntl*: ptr cuint + ## Addres of HDP_MEM_COHERENCY_FLUSH_CNTL register + hdpRegFlushCntl*: ptr cuint + ## Addres of HDP_REG_COHERENCY_FLUSH_CNTL register + cooperativeMultiDeviceUnmatchedFunc*: cint + ## HIP device supports cooperative launch on + ## multiple + ## devices with unmatched functions + cooperativeMultiDeviceUnmatchedGridDim*: cint + ## HIP device supports cooperative launch on + ## multiple + ## devices with unmatched grid dimensions + cooperativeMultiDeviceUnmatchedBlockDim*: cint + ## HIP device supports cooperative launch on + ## multiple + ## devices with unmatched block dimensions + cooperativeMultiDeviceUnmatchedSharedMem*: cint + ## HIP device supports cooperative launch on + ## multiple + ## devices with unmatched shared memories + isLargeBar*: cint + ## 1: if it is a large PCI bar device, else 0 + asicRevision*: cint + ## Revision of the GPU in this device + + hipUUID* {.bycopy.} = object + bytes*: array[16, byte] + + HipDeviceArch* {.bycopy.} = object + ## 32-bit Atomics + hasGlobalInt32Atomics* {.bitsize: 1.}: cuint + ## 32-bit integer 
atomics for global memory. + hasGlobalFloatAtomicExch* {.bitsize: 1.}: cuint + ## 32-bit float atomic exch for global memory. + hasSharedInt32Atomics* {.bitsize: 1.}: cuint + ## 32-bit integer atomics for shared memory. + hasSharedFloatAtomicExch* {.bitsize: 1.}: cuint + ## 32-bit float atomic exch for shared memory. + hasFloatAtomicAdd* {.bitsize: 1.}: cuint + ## 32-bit float atomic add in global and shared memory. + ## 64-bit Atomics + hasGlobalInt64Atomics* {.bitsize: 1.}: cuint + ## 64-bit integer atomics for global memory. + hasSharedInt64Atomics* {.bitsize: 1.}: cuint + ## 64-bit integer atomics for shared memory. + ## Doubles + hasDoubles* {.bitsize: 1.}: cuint + ## Double-precision floating point. + ## Warp cross-lane operations + hasWarpVote* {.bitsize: 1.}: cuint + ## Warp vote instructions (__any, __all). + hasWarpBallot* {.bitsize: 1.}: cuint + ## Warp ballot instructions (__ballot). + hasWarpShuffle* {.bitsize: 1.}: cuint + ## Warp shuffle operations. (__shfl_*). + hasFunnelShift* {.bitsize: 1.}: cuint + ## Funnel two words into one with shift&mask caps. + ## Sync + hasThreadFenceSystem* {.bitsize: 1.}: cuint + ## __threadfence_system. + hasSyncThreadsExt* {.bitsize: 1.}: cuint + ## __syncthreads_count, syncthreads_and, syncthreads_or. + ## Misc + hasSurfaceFuncs* {.bitsize: 1.}: cuint + ## Surface functions. + has3dGrid* {.bitsize: 1.}: cuint + ## Grid and group dims are 3D (rather than 2D). + hasDynamicParallelism* {.bitsize: 1.}: cuint + ## Dynamic parallelism. + +proc hipGetDeviceProperties*(prop: var HipDeviceProp, ordinal: int32): HipError {. 
+ noconv, importc: "hipGetDevicePropertiesR0600", dynlib: libAmdHip.} + # Note the macro + # `#define hipGetDeviceProperties hipGetDevicePropertiesR0600` + +{.push noconv, importc, dynlib: libAmdHip.} + +proc hipInit*(flags: uint32): HipError + +proc hipGetDeviceCount*(count: var int32): HipError +proc hipDeviceGet*(device: var HipDevice, ordinal: int32): HipError +proc hipDeviceGetName*(name: ptr char, len: int32, dev: HipDevice): HipError +proc hipDeviceGetAttribute*(r: var int32, attrib: HipDeviceAttribute, dev: HipDevice): HipError + +proc hipCtxCreate*(ctx: var HipContext, flags: uint32, dev: HipDevice): HipError +proc hipCtxDestroy*(ctx: HipContext): HipError +proc hipCtxSynchronize*(ctx: HipContext): HipError + +proc hipModuleLoadData(module: var HipModule, code_object: pointer): HipError {.used.} +proc hipModuleUnload*(module: HipModule): HipError +proc hipModuleGetFunction(kernel: var HipFunction, module: HipModule, fnName: ptr char): HipError {.used.} + +proc hipModuleLaunchKernel( + kernel: HipFunction, + gridDimX, gridDimY, gridDimZ: uint32, + blockDimX, blockDimY, blockDimZ: uint32, + sharedMemBytes: uint32, + stream: HipStream, + kernelParams: ptr pointer, + extra: ptr pointer + ): HipError {.used.} + +proc hipMalloc*(devptr: var HipDeviceptr, size: csize_t): HipError +proc hipMallocManaged*(devptr: var HipDeviceptr, size: csize_t, flags: Flag[HipMemAttach_flags]): HipError +proc hipFree*(devptr: HipDeviceptr): HipError +proc hipMemcpyHtoD*(dst: HipDeviceptr, src: pointer, size: csize_t): HipError +proc hipMemcpyDtoH*(dst: pointer, src: HipDeviceptr, size: csize_t): HipError + +{.pop.} # {.push importc, dynlib: "libamdhip64.so".} + +# ------------------------------------------------------------------------------ +# Sanity check + +when isMainModule: + + template check*(status: HipError) = + ## Check the status code of a Hip operation + ## Exit program with error if failure + + let code = status # ensure that the input expression is evaluated once only 
+ if code != hipSuccess: + writeStackTrace() + stderr.write(astToStr(status) & " " & $instantiationInfo() & " exited with error: " & $code & '\n') + quit 1 + + proc main*() = + var props: HipDeviceProp + var device: cint = 0 + check hipGetDeviceProperties(props, device) + echo "warpSize: ", props.warpSize + echo "GCN Architecture: ", props.gcnArchName + + main() diff --git a/constantine/platforms/llvm/bindings/c_abi.nim b/constantine/platforms/abis/c_abi.nim similarity index 99% rename from constantine/platforms/llvm/bindings/c_abi.nim rename to constantine/platforms/abis/c_abi.nim index 821b84eda..7e16291f1 100644 --- a/constantine/platforms/llvm/bindings/c_abi.nim +++ b/constantine/platforms/abis/c_abi.nim @@ -7,7 +7,7 @@ # at your option. This file may not be copied, modified, or distributed except according to those terms. import std/macros -import ../../ast_rebuilder +import constantine/platforms/ast_rebuilder # ############################################################ # diff --git a/constantine/platforms/llvm/bindings/llvm_abi.nim b/constantine/platforms/abis/llvm_abi.nim similarity index 73% rename from constantine/platforms/llvm/bindings/llvm_abi.nim rename to constantine/platforms/abis/llvm_abi.nim index 981bcb9ac..7fcf342fb 100644 --- a/constantine/platforms/llvm/bindings/llvm_abi.nim +++ b/constantine/platforms/abis/llvm_abi.nim @@ -8,10 +8,7 @@ import ./c_abi -{.passc: gorge("llvm-config --cflags").} -{.passl: gorge("llvm-config --libs").} -const libLLVM = gorge("llvm-config --libnames") - +const libLLVM = "libLLVM-(16|17|18).so" static: echo "[Constantine] Using library " & libLLVM # ############################################################ @@ -133,6 +130,11 @@ proc initializeNVPTXAsmPrinter() {.importc: "LLVMInitializeNVPTXAsmPrinter".} proc initializeNVPTXTarget() {.importc: "LLVMInitializeNVPTXTarget".} proc initializeNVPTXTargetInfo() {.importc: "LLVMInitializeNVPTXTargetInfo".} proc initializeNVPTXTargetMC() {.importc: 
"LLVMInitializeNVPTXTargetMC".} + +proc initializeAMDGPUAsmPrinter() {.importc: "LLVMInitializeAMDGPUAsmPrinter".} +proc initializeAMDGPUTarget() {.importc: "LLVMInitializeAMDGPUTarget".} +proc initializeAMDGPUTargetInfo() {.importc: "LLVMInitializeAMDGPUTargetInfo".} +proc initializeAMDGPUTargetMC() {.importc: "LLVMInitializeAMDGPUTargetMC".} {.pop.} proc getTargetFromName*(name: cstring): TargetRef {.importc: "LLVMGetTargetFromName".} @@ -287,6 +289,254 @@ proc getElementType*(arrayOrVectorTy: TypeRef): TypeRef {.importc: "LLVMGetEleme # Functions # ------------------------------------------------------------ +type + CallingConvention {.size: sizeof(cuint).} = enum + # The default llvm calling convention, compatible with C. This convention + # is the only one that supports varargs calls. As with typical C calling + # conventions, the callee/caller have to tolerate certain amounts of + # prototype mismatch. + C = 0, + + # Generic LLVM calling conventions. None of these support varargs calls, + # and all assume that the caller and callee prototype exactly match. + + # Attempts to make calls as fast as possible (e.g. by passing things in + # registers). + Fast = 8, + + # Attempts to make code in the caller as efficient as possible under the + # assumption that the call is not commonly executed. As such, these calls + # often preserve all registers so that the call does not break any live + # ranges in the caller side. + Cold = 9, + + # Used by the Glasgow Haskell Compiler (GHC). + GHC = 10, + + # Used by the High-Performance Erlang Compiler (HiPE). + HiPE = 11, + + # OBSOLETED - Used for stack based JavaScript calls + # WebKit_JS = 12, + + # Used for dynamic register based calls (e.g. stackmap and patchpoint + # intrinsics). + AnyReg = 13, + + # Used for runtime calls that preserves most registers. + PreserveMost = 14, + + # Used for runtime calls that preserves (almost) all registers. + PreserveAll = 15, + + # Calling convention for Swift. 
+ Swift = 16, + + # Used for access functions. + CXX_FAST_TLS = 17, + + # Attemps to make calls as fast as possible while guaranteeing that tail + # call optimization can always be performed. + Tail = 18, + + # Special calling convention on Windows for calling the Control Guard + # Check ICall funtion. The function takes exactly one argument (address of + # the target function) passed in the first argument register, and has no + # return value. All register values are preserved. + CFGuard_Check = 19, + + # This follows the Swift calling convention in how arguments are passed + # but guarantees tail calls will be made by making the callee clean up + # their stack. + SwiftTail = 20, + + # Used for runtime calls that preserves none general registers. + PreserveNone = 21, + + # This is the start of the target-specific calling conventions, e.g. + # fastcall and thiscall on X86. + # FirstTargetCC = 64, + + # stdcall is mostly used by the Win32 API. It is basically the same as the + # C convention with the difference in that the callee is responsible for + # popping the arguments from the stack. + X86_StdCall = 64, + + # 'fast' analog of X86_StdCall. Passes first two arguments in ECX:EDX + # registers, others - via stack. Callee is responsible for stack cleaning. + X86_FastCall = 65, + + # ARM Procedure Calling Standard (obsolete, but still used on some + # targets). + ARM_APCS = 66, + + # ARM Architecture Procedure Calling Standard calling convention (aka + # EABI). Soft float variant. + ARM_AAPCS = 67, + + # Same as ARM_AAPCS, but uses hard floating point ABI. + ARM_AAPCS_VFP = 68, + + # Used for MSP430 interrupt routines. + MSP430_INTR = 69, + + # Similar to X86_StdCall. Passes first argument in ECX, others via stack. + # Callee is responsible for stack cleaning. MSVC uses this by default for + # methods in its ABI. + X86_ThisCall = 70, + + # Call to a PTX kernel. Passes all arguments in parameter space. + PTX_Kernel = 71, + + # Call to a PTX device function. 
Passes all arguments in register or + # parameter space. + PTX_Device = 72, + + # Used for SPIR non-kernel device functions. No lowering or expansion of + # arguments. Structures are passed as a pointer to a struct with the + # byval attribute. Functions can only call SPIR_FUNC and SPIR_KERNEL + # functions. Functions can only have zero or one return values. Variable + # arguments are not allowed, except for printf. How arguments/return + # values are lowered are not specified. Functions are only visible to the + # devices. + SPIR_FUNC = 75, + + # Used for SPIR kernel functions. Inherits the restrictions of SPIR_FUNC, + # except it cannot have non-void return values, it cannot have variable + # arguments, it can also be called by the host or it is externally + # visible. + SPIR_KERNEL = 76, + + # Used for Intel OpenCL built-ins. + Intel_OCL_BI = 77, + + # The C convention as specified in the x86-64 supplement to the System V + # ABI, used on most non-Windows systems. + X86_64_SysV = 78, + + # The C convention as implemented on Windows/x86-64 and AArch64. It + # differs from the more common \c X86_64_SysV convention in a number of + # ways, most notably in that XMM registers used to pass arguments are + # shadowed by GPRs, and vice versa. On AArch64, this is identical to the + # normal C (AAPCS) calling convention for normal functions, but floats are + # passed in integer registers to variadic functions. + Win64 = 79, + + # MSVC calling convention that passes vectors and vector aggregates in SSE + # registers. + X86_VectorCall = 80, + + # Placeholders for HHVM calling conventions (deprecated, removed). + DUMMY_HHVM = 81, + DUMMY_HHVM_C = 82, + + # x86 hardware interrupt context. Callee may take one or two parameters, + # where the 1st represents a pointer to hardware context frame and the 2nd + # represents hardware error code, the presence of the later depends on the + # interrupt vector taken. Valid for both 32- and 64-bit subtargets. 
+ X86_INTR = 83, + + # Used for AVR interrupt routines. + AVR_INTR = 84, + + # Used for AVR signal routines. + AVR_SIGNAL = 85, + + # Used for special AVR rtlib functions which have an "optimized" + # convention to preserve registers. + AVR_BUILTIN = 86, + + # Used for Mesa vertex shaders, or AMDPAL last shader stage before + # rasterization (vertex shader if tessellation and geometry are not in + # use, or otherwise copy shader if one is needed). + AMDGPU_VS = 87, + + # Used for Mesa/AMDPAL geometry shaders. + AMDGPU_GS = 88, + + # Used for Mesa/AMDPAL pixel shaders. + AMDGPU_PS = 89, + + # Used for Mesa/AMDPAL compute shaders. + AMDGPU_CS = 90, + + # Used for AMDGPU code object kernels. + AMDGPU_KERNEL = 91, + + # Register calling convention used for parameters transfer optimization + X86_RegCall = 92, + + # Used for Mesa/AMDPAL hull shaders (= tessellation control shaders). + AMDGPU_HS = 93, + + # Used for special MSP430 rtlib functions which have an "optimized" + # convention using additional registers. + MSP430_BUILTIN = 94, + + # Used for AMDPAL vertex shader if tessellation is in use. + AMDGPU_LS = 95, + + # Used for AMDPAL shader stage before geometry shader if geometry is in + # use. So either the domain (= tessellation evaluation) shader if + # tessellation is in use, or otherwise the vertex shader. + AMDGPU_ES = 96, + + # Used between AArch64 Advanced SIMD functions + AArch64_VectorCall = 97, + + # Used between AArch64 SVE functions + AArch64_SVE_VectorCall = 98, + + # For emscripten __invoke_* functions. The first argument is required to + # be the function ptr being indirectly called. The remainder matches the + # regular calling convention. + WASM_EmscriptenInvoke = 99, + + # Used for AMD graphics targets. + AMDGPU_Gfx = 100, + + # Used for M68k interrupt routines. + M68k_INTR = 101, + + # Preserve X0-X13, X19-X29, SP, Z0-Z31, P0-P15. + AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0 = 102, + + # Preserve X2-X15, X19-X29, SP, Z0-Z31, P0-P15. 
+ AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2 = 103, + + # Used on AMDGPUs to give the middle-end more control over argument + # placement. + AMDGPU_CS_Chain = 104, + + # Used on AMDGPUs to give the middle-end more control over argument + # placement. Preserves active lane values for input VGPRs. + AMDGPU_CS_ChainPreserve = 105, + + # Used for M68k rtd-based CC (similar to X86's stdcall). + M68k_RTD = 106, + + # Used by GraalVM. Two additional registers are reserved. + GRAAL = 107, + + # Calling convention used in the ARM64EC ABI to implement calls between + # x64 code and thunks. This is basically the x64 calling convention using + # ARM64 register names. The first parameter is mapped to x9. + ARM64EC_Thunk_X64 = 108, + + # Calling convention used in the ARM64EC ABI to implement calls between + # ARM64 code and thunks. This is just the ARM64 calling convention, + # except that the first parameter is mapped to x9. + ARM64EC_Thunk_Native = 109, + + # Calling convention used for RISC-V V-extension. + RISCV_VectorCall = 110, + + # Preserve X1-X15, X19-X29, SP, Z0-Z31, P0-P15. + AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1 = 111, + + # The highest possible ID. Must be some 2^k - 1. 
+ MaxID = 1023 + proc function_t*( returnType: TypeRef, paramTypes: openArray[TypeRef], @@ -299,6 +549,9 @@ proc addFunction*(m: ModuleRef, name: cstring, ty: TypeRef): ValueRef {.importc: proc getReturnType*(functionTy: TypeRef): TypeRef {.importc: "LLVMGetReturnType".} proc countParamTypes*(functionTy: TypeRef): uint32 {.importc: "LLVMCountParamTypes".} +proc getCallingConvention*(function: ValueRef): CallingConvention {.importc: "LLVMGetFunctionCallConv".} +proc setCallingConvention*(function: ValueRef, cc: CallingConvention) {.importc: "LLVMSetFunctionCallConv".} + # ############################################################ # # Values diff --git a/constantine/platforms/llvm/bindings/nvidia_abi.nim b/constantine/platforms/abis/nvidia_abi.nim similarity index 99% rename from constantine/platforms/llvm/bindings/nvidia_abi.nim rename to constantine/platforms/abis/nvidia_abi.nim index ae37ed713..02dd8808f 100644 --- a/constantine/platforms/llvm/bindings/nvidia_abi.nim +++ b/constantine/platforms/abis/nvidia_abi.nim @@ -20,8 +20,9 @@ import ./c_abi # # ############################################################ -static: echo "[Constantine] Using library libcuda.so" -{.passl: "-L/opt/cuda/lib64 -lcuda".} +const libPath = "/opt/cuda/lib64/" # For now, only support Linux +static: echo "[Constantine] Will search Cuda runtime in $LD_LIBRARY_PATH and " & libPath & "libcuda.so" +const libCuda = "(libcuda.so|" & libPath & "libcuda.so)" # Cuda offers 2 APIs: # - cuda.h the driver API @@ -482,7 +483,7 @@ type CUstream* = distinct pointer CUdeviceptr* = distinct pointer -{.push noconv, importc, dynlib: "libcuda.so".} +{.push noconv, importc, dynlib: libCuda.} proc cuInit*(flags: uint32): CUresult diff --git a/constantine/platforms/llvm/llvm.nim b/constantine/platforms/llvm/llvm.nim index c064897bf..d222a306d 100644 --- a/constantine/platforms/llvm/llvm.nim +++ b/constantine/platforms/llvm/llvm.nim @@ -6,7 +6,7 @@ # * Apache v2 license (license terms in the root directory or 
at http://www.apache.org/licenses/LICENSE-2.0). # at your option. This file may not be copied, modified, or distributed except according to those terms. -import ./bindings/llvm_abi {.all.} +import constantine/platforms/abis/llvm_abi {.all.} export llvm_abi # ############################################################ @@ -48,7 +48,7 @@ proc toBitcode*(m: ModuleRef): seq[byte] = copyMem(result[0].addr, mb.getBufferStart(), len) mb.dispose() -template verify*(module: ModuleRef, failureAction: VerifierFailureAction) = +proc verify*(module: ModuleRef, failureAction: VerifierFailureAction) = ## Verify the IR code in a module var errMsg: LLVMstring let err = bool verify(module, failureAction, errMsg) @@ -68,7 +68,7 @@ proc getIdentifier*(module: ModuleRef): string = # Target # ------------------------------------------------------------ -template toTarget*(triple: cstring): TargetRef = +proc toTarget*(triple: cstring): TargetRef = var target: TargetRef var errMsg: LLVMstring let err = bool triple.getTargetFromTriple(target, errMsg) @@ -94,10 +94,16 @@ proc initializeFullNVPTXTarget* {.inline.} = initializeNVPTXTargetMC() initializeNVPTXAsmPrinter() +proc initializeFullAMDGPUTarget* {.inline.} = + initializeAMDGPUTargetInfo() + initializeAMDGPUTarget() + initializeAMDGPUTargetMC() + initializeAMDGPUAsmPrinter() + # Execution Engine # ------------------------------------------------------------ -template createJITCompilerForModule*( +proc createJITCompilerForModule*( engine: var ExecutionEngineRef, module: ModuleRef, optLevel: uint32) = @@ -112,7 +118,7 @@ template createJITCompilerForModule*( # Target Machine # ------------------------------------------------------------ -template emitToFile*(t: TargetMachineRef, m: ModuleRef, +proc emitToFile*(t: TargetMachineRef, m: ModuleRef, fileName: string, codegen: CodeGenFileType) = var errMsg: LLVMstring let err = bool targetMachineEmitToFile(t, m, cstring(fileName), codegen, errMsg) @@ -122,7 +128,7 @@ template emitToFile*(t: 
TargetMachineRef, m: ModuleRef, errMsg.dispose() quit 1 -template emitToString*(t: TargetMachineRef, m: ModuleRef, codegen: CodeGenFileType): string = +proc emitTo*[T: string or seq[byte]](t: TargetMachineRef, m: ModuleRef, codegen: CodeGenFileType): T = ## Codegen to string var errMsg: LLVMstring var mb: MemoryBufferRef @@ -133,10 +139,9 @@ template emitToString*(t: TargetMachineRef, m: ModuleRef, codegen: CodeGenFileTy errMsg.dispose() quit 1 let len = mb.getBufferSize() - var emitted = newString(len) - copyMem(emitted[0].addr, mb.getBufferStart(), len) + result.setLen(len) + copyMem(result[0].addr, mb.getBufferStart(), len) mb.dispose() - emitted # Builder # ------------------------------------------------------------ diff --git a/research/codegen/x86_instr.nim b/research/codegen/x86_instr.nim index 1e6211df1..a4a192194 100644 --- a/research/codegen/x86_instr.nim +++ b/research/codegen/x86_instr.nim @@ -7,7 +7,7 @@ # at your option. This file may not be copied, modified, or distributed except according to those terms. 
import - constantine/platforms/llvm/bindings/c_abi, + constantine/platforms/abis/c_abi, constantine/platforms/llvm/llvm, constantine/platforms/primitives, constantine/math_compiler/ir, diff --git a/research/codegen/x86_poc.nim b/research/codegen/x86_poc.nim index 307826158..c5c376fe4 100644 --- a/research/codegen/x86_poc.nim +++ b/research/codegen/x86_poc.nim @@ -156,7 +156,7 @@ when isMainModule: echo "=========================================" echo "Assembly\n" - echo machine.emitToString(asy.module, AssemblyFile) + echo machine.emitTo[:string](asy.module, AssemblyFile) echo "=========================================" # Output diff --git a/tests/gpu/hello_world_amdgpu.nim b/tests/gpu/hello_world_amdgpu.nim new file mode 100644 index 000000000..c0f57d23c --- /dev/null +++ b/tests/gpu/hello_world_amdgpu.nim @@ -0,0 +1,211 @@ +# Constantine +# Copyright (c) 2018-2019 Status Research & Development GmbH +# Copyright (c) 2020-Present Mamy André-Ratsimbazafy +# Licensed and distributed under either of +# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT). +# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). +# at your option. This file may not be copied, modified, or distributed except according to those terms. 
+ +import + constantine/platforms/llvm/llvm, + constantine/math_compiler/codegen_amdgpu + +echo "AMD GPU JIT compiler Hello World" + +# Docs: +# - https://rocm.docs.amd.com/projects/llvm-project/en/latest/reference/rocmcc.html +# - https://llvm.org/docs/AMDGPUUsage.html + + +proc writeExampleAddMul(ctx: ContextRef, module: ModuleRef, addKernelName, mulKernelName: string) = + + # ###################################### + # Metadata + + const triple = "amdgcn-amd-amdhsa" + + const datalayout1 {.used.} = + "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-" & + "i64:64-" & + "v16:16-v24:32-" & + "v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-" & + "n32:64-S32-A5-G1-ni:7" + + const datalayout2 = + "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-" & + "i64:64-" & + "v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-" & + "n32:64-S32-A5-G1-ni:7:8" + + + # ###################################### + # LLVM IR codegen + + module.setTarget(triple) + module.setDataLayout(datalayout2) + let i128 = ctx.int128_t() + let void_t = ctx.void_t() + + let builder = ctx.createBuilder() + defer: builder.dispose() + + block: + let addType = function_t(void_t, [i128.pointer_t(), i128, i128], isVarArg = LlvmBool(false)) + let addKernel = module.addFunction(addKernelName, addType) + let blck = ctx.appendBasicBlock(addKernel, "addBody") + builder.positionAtEnd(blck) + let r = addKernel.getParam(0) + let a = addKernel.getParam(1) + let b = addKernel.getParam(2) + let sum = builder.add(a, b, "sum") + builder.store(sum, r) + builder.retVoid() + + module.wrapInCallableHipKernel((addType, addKernel)) + + block: + let mulType = function_t(void_t, [i128.pointer_t(), i128, i128], isVarArg = LlvmBool(false)) + let mulKernel = module.addFunction(mulKernelName, mulType) + let blck = ctx.appendBasicBlock(mulKernel, "mulBody") + builder.positionAtEnd(blck) + let r = mulKernel.getParam(0) 
+ let a = mulKernel.getParam(1) + let b = mulKernel.getParam(2) + let prod = builder.mul(a, b, "prod") + builder.store(prod, r) + builder.retVoid() + + module.wrapInCallableHipKernel((mulType, mulKernel)) + + module.verify(AbortProcessAction) + + block: + echo "=================" + echo "LLVM IR output" + echo $module + echo "=================" + +func toHex*(a: uint64): string = + const hexChars = "0123456789abcdef" + const L = 2*sizeof(uint64) + result = newString(L) + var a = a + for j in countdown(result.len-1, 0): + result[j] = hexChars[a and 0xF] + a = a shr 4 + +func toString*(a: openArray[uint64]): string = + result = "0x" + for i in countdown(a.len-1, 0): + result.add toHex(a[i]) + +proc getHipKernel(hipMod: HipModule, fnName: string): HipFunction = + check hipModuleGetFunction(result, hipMod, fnName & "_public") + +proc main() = + + ####################################### + # GPU init + let hipDevice = hipDeviceInit() + + ####################################### + # LLVM IR + let ctx = createContext() + let module = ctx.createModule("test_nnvm") + + let addKernelName = "addKernel" + let mulKernelName = "mulKernel" + + writeExampleAddMul(ctx, module, addKernelName, mulKernelName) + module.verify(AbortProcessAction) + + ####################################### + # Compilation + + initializeFullAMDGPUTarget() + const triple = "amdgcn-amd-amdhsa" + let gcnArchName = getGcnArchName(deviceId = 0) + + let machine = createTargetMachine( + target = toTarget(triple), + triple = triple, + cpu = cstring(gcnArchName), + features = "", + level = CodeGenLevelAggressive, + reloc = RelocDefault, + codeModel = CodeModelDefault + ) + + let objectCode = machine.emitTo[:seq[byte]](module, ObjectFile) + let assembly = machine.emitTo[:string](module, AssemblyFile) + + module.dispose() + ctx.dispose() + + block: + echo "=================" + echo "AMD GCN output" + echo $assembly + echo "=================" + + let exeCode = objectCode.linkAmdGpu(gcnArchName) + + 
####################################### + # GPU JIT + var hipCtx: HipContext + var hipMod: HipModule + check hipCtxCreate(hipCtx, 0, hipDevice) + check hipModuleLoadData(hipMod, exeCode[0].addr) + let addKernel = hipMod.getHipKernel(addKernelName) + let mulKernel = hipMod.getHipKernel(mulKernelName) + + + ####################################### + # Kernel launch + var r{.noInit.}, a, b: array[2, uint64] + + a[1] = 0x00000000000001FF'u64; a[0] = 0xFFFFFFFFFFFFFFFF'u64 + b[1] = 0x0000000000000000'u64; b[0] = 0x0010000000000000'u64 + + echo "r: ", r.toString() + echo "a: ", a.toString() + echo "b: ", b.toString() + + var rGPU: HipDeviceptr + check hipMalloc(rGPU, csize_t sizeof(r)) + + let params = [pointer(rGPU.addr), pointer(a.addr), pointer(b.addr)] + + check hipModuleLaunchKernel( + addKernel, + 1, 1, 1, + 1, 1, 1, + 0, HipStream(nil), + params[0].unsafeAddr, nil) + + check hipMemcpyDtoH(r.addr, rGPU, csize_t sizeof(r)) + echo "a+b: ", r.toString() + + check hipModuleLaunchKernel( + mulKernel, + 1, 1, 1, + 1, 1, 1, + 0, HipStream(nil), + params[0].unsafeAddr, nil) + + check hipMemcpyDtoH(r.addr, rGPU, csize_t sizeof(r)) + echo "a*b: ", r.toString() + + ####################################### + # Cleanup + + check hipFree(rGPU) + rGPU = HipDeviceptr(nil) + + check hipModuleUnload(hipMod) + hipMod = HipModule(nil) + + check hipCtxDestroy(hipCtx) + hipCtx = HipContext(nil) + +main() diff --git a/tests/gpu/hello_world_nvidia.nim b/tests/gpu/hello_world_nvidia.nim index 1f81ce54f..f2a069cfc 100644 --- a/tests/gpu/hello_world_nvidia.nim +++ b/tests/gpu/hello_world_nvidia.nim @@ -6,7 +6,9 @@ # * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). # at your option. This file may not be copied, modified, or distributed except according to those terms. 
-import constantine/platforms/llvm/[llvm, nvidia, bindings/c_abi] +import + constantine/platforms/llvm/llvm, + constantine/math_compiler/codegen_nvidia # ############################################################ # @@ -136,7 +138,7 @@ proc ptxCodegenViaLlvmNvptx(module: ModuleRef, sm: tuple[major, minor: int32]): codeModel = CodeModelDefault ) - machine.emitToString(module, AssemblyFile) + machine.emitTo[:string](module, AssemblyFile) # ############################################################ # @@ -153,7 +155,11 @@ proc writeExampleAddMul(ctx: ContextRef, module: ModuleRef, addKernelName, mulKe const triple = "nvptx64-nvidia-cuda" # Datalayout for NVVM IR 1.8 (CUDA 11.6) - const datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" + const datalayout = + "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-" & + "f32:32:32-f64:64:64-" & + "v16:16:16-v32:32:32-v64:64:64-v128:128:128-" & + "n16:32:64" # ###################################### # LLVM IR codegen @@ -178,7 +184,7 @@ proc writeExampleAddMul(ctx: ContextRef, module: ModuleRef, addKernelName, mulKe builder.store(sum, r) builder.retVoid() - module.setCallableCudaKernel((addType, addKernel)) + module.wrapInCallableCudaKernel((addType, addKernel)) block: let mulType = function_t(void_t, [i128.pointer_t(), i128, i128], isVarArg = LlvmBool(false)) @@ -192,7 +198,7 @@ proc writeExampleAddMul(ctx: ContextRef, module: ModuleRef, addKernelName, mulKe builder.store(prod, r) builder.retVoid() - module.setCallableCudaKernel((mulType, mulKernel)) + module.wrapInCallableCudaKernel((mulType, mulKernel)) module.verify(AbortProcessAction) diff --git a/tests/gpu/t_nvidia_fp.nim b/tests/gpu/t_nvidia_fp.nim index 1384aa92d..b3aa873fe 100644 --- a/tests/gpu/t_nvidia_fp.nim +++ b/tests/gpu/t_nvidia_fp.nim @@ -39,21 +39,21 @@ proc init(T: type CurveMetadata, asy: Assembler_LLVM, curve: static Algebra, 
wor proc genFieldAddPTX(asy: Assembler_LLVM, cm: CurveMetadata) = let fpAdd = asy.field_add_gen(cm, fp) - asy.module.setCallableCudaKernel(fpAdd) + asy.module.wrapInCallableCudaKernel(fpAdd) let frAdd = asy.field_add_gen(cm, fr) - asy.module.setCallableCudaKernel(frAdd) + asy.module.wrapInCallableCudaKernel(frAdd) proc genFieldSubPTX(asy: Assembler_LLVM, cm: CurveMetadata) = let fpSub = asy.field_sub_gen(cm, fp) - asy.module.setCallableCudaKernel(fpSub) + asy.module.wrapInCallableCudaKernel(fpSub) let frSub = asy.field_sub_gen(cm, fr) - asy.module.setCallableCudaKernel(frSub) + asy.module.wrapInCallableCudaKernel(frSub) proc genFieldMulPTX(asy: Assembler_LLVM, cm: CurveMetadata) = let fpMul = asy.field_mul_gen(cm, fp) - asy.module.setCallableCudaKernel(fpMul) + asy.module.wrapInCallableCudaKernel(fpMul) let frMul = asy.field_mul_gen(cm, fr) - asy.module.setCallableCudaKernel(frMul) + asy.module.wrapInCallableCudaKernel(frMul) # Init LLVM # -------------------------