From 1e34ec22929eaba7bcf1681350ec21aed8f370f7 Mon Sep 17 00:00:00 2001 From: Mamy Ratsimbazafy Date: Mon, 5 Aug 2024 07:23:15 +0200 Subject: [PATCH] AMDGPU JIT compiler (#453) * feat(AMD GPU): initial commit to support AMD GPU (working for ASM but not object code) * feat(AMD GPU): :fire: :fire: end to end JIT compilation to AMD GPU is working --- constantine/math_compiler/codegen_amdgpu.nim | 188 +++++ constantine/math_compiler/codegen_nvidia.nim | 12 +- constantine/platforms/abis/amdcomgr_abi.nim | 735 ++++++++++++++++++ constantine/platforms/abis/amdgpu_abi.nim | 658 ++++++++++++++++ .../{llvm/bindings => abis}/c_abi.nim | 2 +- .../{llvm/bindings => abis}/llvm_abi.nim | 261 ++++++- .../{llvm/bindings => abis}/nvidia_abi.nim | 7 +- constantine/platforms/llvm/llvm.nim | 23 +- research/codegen/x86_instr.nim | 2 +- research/codegen/x86_poc.nim | 2 +- tests/gpu/hello_world_amdgpu.nim | 211 +++++ tests/gpu/hello_world_nvidia.nim | 16 +- tests/gpu/t_nvidia_fp.nim | 12 +- 13 files changed, 2093 insertions(+), 36 deletions(-) create mode 100644 constantine/math_compiler/codegen_amdgpu.nim create mode 100644 constantine/platforms/abis/amdcomgr_abi.nim create mode 100644 constantine/platforms/abis/amdgpu_abi.nim rename constantine/platforms/{llvm/bindings => abis}/c_abi.nim (99%) rename constantine/platforms/{llvm/bindings => abis}/llvm_abi.nim (73%) rename constantine/platforms/{llvm/bindings => abis}/nvidia_abi.nim (99%) create mode 100644 tests/gpu/hello_world_amdgpu.nim diff --git a/constantine/math_compiler/codegen_amdgpu.nim b/constantine/math_compiler/codegen_amdgpu.nim new file mode 100644 index 000000000..c9e84b9ee --- /dev/null +++ b/constantine/math_compiler/codegen_amdgpu.nim @@ -0,0 +1,188 @@ +# Constantine +# Copyright (c) 2018-2019 Status Research & Development GmbH +# Copyright (c) 2020-Present Mamy André-Ratsimbazafy +# Licensed and distributed under either of +# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT). 
+# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). +# at your option. This file may not be copied, modified, or distributed except according to those terms. + +import + constantine/platforms/abis/amdgpu_abi {.all.}, + constantine/platforms/abis/amdcomgr_abi, + constantine/platforms/abis/c_abi, + constantine/platforms/llvm/llvm, + constantine/platforms/primitives, + ./ir + +export + amdgpu_abi, + Flag, flag, wrapOpenArrayLenType + +# ############################################################ +# +# AMD GPUs API +# +# ############################################################ + +# Hip Runtime API +# ------------------------------------------------------------ + +template check*(status: HipError) = + ## Check the status code of a Hip operation + ## Exit program with error if failure + + let code = status # ensure that the input expression is evaluated once only + if code != hipSuccess: + writeStackTrace() + stderr.write(astToStr(status) & " " & $instantiationInfo() & " exited with error: " & $code & '\n') + quit 1 + +func hipModuleGetFunction*(kernel: var HipFunction, module: HipModule, fnName: openArray[char]): HipError {.inline.}= + hipModuleGetFunction(kernel, module, fnName[0].unsafeAddr) + +proc getGcnArchName*(deviceID: int32): string = + var prop: HipDeviceProp + check hipGetDeviceProperties(prop, deviceID) + + for c in prop.gcnArchName: + if c != '\0': + result.add c + +proc hipDeviceInit*(deviceID = 0'i32): HipDevice = + + check hipInit(deviceID.uint32) + + var devCount: int32 + check hipGetDeviceCount(devCount) + if devCount == 0: + echo "hipDeviceInit error: no devices supporting AMD ROCm/HIP" + quit 1 + + var hipDevice: HipDevice + check hipDeviceGet(hipDevice, deviceID) + var name = newString(128) + check hipDeviceGetName(name[0].addr, name.len.int32, hipDevice) + echo "Using HIP Device [", deviceID, "]: ", cstring(name) + echo "AMD GCN ARCH: ", deviceID.getGcnArchName() + + return hipDevice + 
+# ############################################################ +# +# LLVM IR for AMD GPUs +# +# ############################################################ +# +# Note: +# __device__ functions for field and elliptic curve arithmetic +# might be compiled by default with scalar codegen +# +# We will need to either: +# - Derive explicitly a vectorized version of the warp/wave size (32) +# - Derive implicitly a vectorized version, probably with __forceinline__ + +proc wrapInCallableHipKernel*(module: ModuleRef, fn: FnDef) = + ## Create a public wrapper of a Hip device function + ## + ## A function named `addmod` can be found by appending _public + ## check hipModuleGetFunction(fnPointer, cuModule, "addmod_public") + + let pubName = fn.fnImpl.getName() & "_public" + let pubFn = module.addFunction(cstring(pubName), fn.fnTy) + + let ctx = module.getContext() + let builder = ctx.createBuilder() + defer: builder.dispose() + + let blck = ctx.appendBasicBlock(pubFn, "publicKernelBody") + builder.positionAtEnd(blck) + + var args = newSeq[ValueRef](fn.fnTy.countParamTypes()) + for i, arg in mpairs(args): + arg = pubFn.getParam(i.uint32) + discard builder.call2(fn.fnTy, fn.fnImpl, args) + + # A public kernel must return void + builder.retVoid() + pubFn.setCallingConvention(AMDGPU_KERNEL) + +# ############################################################ +# +# Code generation +# +# ############################################################ + +template check*(status: ComgrStatus) = + ## Check the status code of a Comgr operation + ## Exit program with error if failure + + let code = status # ensure that the input expression is evaluated once only + if code != AMD_COMGR_STATUS_SUCCESS: + writeStackTrace() + stderr.write(astToStr(status) & " " & $instantiationInfo() & " exited with error: " & $code & '\n') + quit 1 + + +proc linkAmdGpu*(reloc_obj: seq[byte], gcnArchName: string): seq[byte] {.noInline.} = + ## Link a relocatable object code + ## into an executable that can be used 
through hipModuleLoadData + var roc: ComgrData + check amd_comgr_create_data(AMD_COMGR_DATA_KIND_RELOCATABLE, roc) + defer: check amd_comgr_release_data(roc) + + var ai: ComgrActionInfo + check amd_comgr_create_action_info(ai) + defer: check amd_comgr_destroy_action_info(ai) + + var ds: ComgrDataset + check amd_comgr_create_data_set(ds) + defer: check amd_comgr_destroy_data_set(ds) + + var dsOut: ComgrDataset + check amd_comgr_create_data_set(dsOut) + defer: check amd_comgr_destroy_data_set(dsOut) + + check roc.amd_comgr_set_data(reloc_obj.len.csize_t(), reloc_obj[0].addr) + check roc.amd_comgr_set_data_name("linkAmdGpu-input.o") + check ds.amd_comgr_data_set_add(roc) + + check ai.amd_comgr_action_info_set_isa_name( + cstring("amdgcn-amd-amdhsa--" & gcnArchName) + ) + + check amd_comgr_do_action( + AMD_COMGR_ACTION_LINK_RELOCATABLE_TO_EXECUTABLE, + info = ai, + input = ds, + output = dsOut) + + # Extract the executable + # ------------------------------------------------ + + var exe: ComgrData + check amd_comgr_create_data(AMD_COMGR_DATA_KIND_EXECUTABLE, exe) + defer: check amd_comgr_release_data(exe) + + check amd_comgr_action_data_get_data( + dsOut, AMD_COMGR_DATA_KIND_EXECUTABLE, + index = 0, exe) + + # Query the required buffer size + var size: csize_t + check amd_comgr_get_data( + exe, size, nil) + + # Size includes nul char + # But we use seq[byte] not a string, so Nim doesn't auto-inster a \0 + # Hence allocation size is exact. + result.setLen(int size) + + check amd_comgr_get_data( + exe, size, result[0].addr) + + +# ############################################################ +# +# Code execution +# +# ############################################################ diff --git a/constantine/math_compiler/codegen_nvidia.nim b/constantine/math_compiler/codegen_nvidia.nim index 0315ad131..19e92019d 100644 --- a/constantine/math_compiler/codegen_nvidia.nim +++ b/constantine/math_compiler/codegen_nvidia.nim @@ -7,15 +7,15 @@ # at your option. 
This file may not be copied, modified, or distributed except according to those terms. import - constantine/platforms/llvm/bindings/nvidia_abi {.all.}, - constantine/platforms/llvm/bindings/c_abi, + constantine/platforms/abis/nvidia_abi {.all.}, + constantine/platforms/abis/c_abi, constantine/platforms/llvm/[llvm, nvidia_inlineasm], constantine/platforms/primitives, ./ir export nvidia_abi, nvidia_inlineasm, - Flag, flag + Flag, flag, wrapOpenArrayLenType # ############################################################ # @@ -131,7 +131,7 @@ proc tagCudaKernel(module: ModuleRef, fn: FnDef) = ])) ) -proc setCallableCudaKernel*(module: ModuleRef, fn: FnDef) = +proc wrapInCallableCudaKernel*(module: ModuleRef, fn: FnDef) = ## Create a public wrapper of a cuda device function ## ## A function named `addmod` can be found by appending _public @@ -202,7 +202,7 @@ proc codegenNvidiaPTX*(asy: Assembler_LLVM, sm: tuple[major, minor: int32]): str errMsg.dispose() quit 1 - return machine.emitToString(asy.module, AssemblyFile) + return machine.emitTo[:string](asy.module, AssemblyFile) # ############################################################ # @@ -263,4 +263,4 @@ proc exec*[T](jitFn: CUfunction, r: var T, a, b: T) = check cuMemFree(rGPU) check cuMemFree(aGPU) - check cuMemFree(bGPU) \ No newline at end of file + check cuMemFree(bGPU) diff --git a/constantine/platforms/abis/amdcomgr_abi.nim b/constantine/platforms/abis/amdcomgr_abi.nim new file mode 100644 index 000000000..0ce4b66d6 --- /dev/null +++ b/constantine/platforms/abis/amdcomgr_abi.nim @@ -0,0 +1,735 @@ +# Constantine +# Copyright (c) 2018-2019 Status Research & Development GmbH +# Copyright (c) 2020-Present Mamy André-Ratsimbazafy +# Licensed and distributed under either of +# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT). +# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). +# at your option. 
This file may not be copied, modified, or distributed except according to those terms. + +# AMD Code Object Manager (comgr) +# -------------------------------------------------------------------------------------------------- +# +# https://github.com/ROCm/llvm-project/tree/amd-staging/amd/comgr + +# Overview +# Unlike Nvidia GPUs that use a Virtual ISA that is then recompiled +# by the CUDA driver, AMD GPUs produce object code. +# +# That object code then must be linked by LLD +# https://llvm.org/docs/AMDGPUUsage.html#elf-code-object +# +# Unfortunately LLD is not designed as a library like the rest of LLVM +# and in particular does not provide a C API +# https://groups.google.com/g/llvm-dev/c/K30vI0AU9vg?pli=1 +# +# 1. We can link to it using C++ lld::elf::link (or lld::coff::link or ...) +# https://github.com/llvm/llvm-project/blob/llvmorg-18.1.8/lld/include/lld/Common/Driver.h#L52-L57 +# This is what is done in MLIR: +# https://reviews.llvm.org/D80676#change-KfPawdnRjasK +# +# 2. Alternatively, if we don't want to compile Nim via C++ +# we can have a .cpp file with extern "C" +# and customize the build with +# - querying the nimcache directory via std/compilersettings +# - staticExec the .cpp file +# - passL to link the resulting object +# - Use a C++ linker +# See https://github.com/xrfez/nim_mixed_c_cpp_static/blob/master/helloWorld.nim#L14-L18 +# +# 3. Yet another alternative is to call LDD +# +# 4. MCJIT or OrcJIT can bypass the need of an external linker +# But calling GPU functions is not supported +# https://llvm.org/docs/JITLink.html#jitlink-availability-and-feature-status +# +# 5. 
It might be possible to use AMD HIP RTC API +# as it supports linking LLVM bitcode +# https://rocm.docs.amd.com/projects/HIP/en/docs-6.0.0/user_guide/hip_rtc.html +# +# Criticisms +# +# Solution 3 seems to be wildly adopted, in Julia, IREE and Taichi +# - https://github.com/iree-org/iree/blob/26f77de/compiler/plugins/target/LLVMCPU/internal/EmbeddedLinkerTool.cpp#L42-L48 +# - https://github.com/JuliaGPU/AMDGPU.jl/blob/v0.9.6/src/compiler/codegen.jl#L148-L154 +# - https://github.com/taichi-dev/taichi/pull/6482/files#diff-9ab763eb7ff4e6aca0a97f774cc740c609d0258ac584039d0c8cae099dfea452R90 +# +# However ldd is shipped as a separate tool from LLVM, and as a cryptographic library +# we need to minimize the attack surface, i.e. someone installing a "ldd" script that would be executed byu our code. +# It's harder to replace library paths as those need root or are restricted to a shell session if overloading LD_LIBRARY_PATH +# +# Solution 1 needs temporary files +# Solution 2 too and seems hard to maintain +# Solution 4 is a non-starter +# +# Solution 5 is likely possible as the header offers the following enum value +# to pass to the linker "hiprtcJITInputType: HIPRTC_JIT_INPUT_OBJECT" +# However we don't really need the full RTC since we already did LLVM IR -> object file +# +# Looking deeper into hipRTC we see that it depends on comgr, just like the HIP runtime +# and comgr only roles is dealing with object file. +# It does use LLD under-the-hood but from a fork specialized for AMD purposes: +# https://github.com/ROCm/llvm-project/blob/rocm-6.2.0/amd/comgr/src/comgr-compiler.cpp#L614-L630 +# Hence we solve all of our concerns. + +const + # Generated from Comgr 2.6 + AMD_COMGR_INTERFACE_VERSION_MAJOR {.used.} = 2 + AMD_COMGR_INTERFACE_VERSION_MINOR {.used.} = 6 + +## \defgroup codeobjectmanager Code Object Manager +## @{ +## +## @brief The code object manager is a callable library that provides +## operations for creating and inspecting code objects. 
+## +## The library provides handles to various objects. Concurrent execution of +## operations is supported provided all objects accessed by each concurrent +## operation are disjoint. For example, the @p amd_comgr_data_set_t handles +## passed to operations must be disjoint, together with all the @p +## amd_comgr_data_t handles that have been added to it. The exception is that +## the default device library data object handles can be non-disjoint as they +## are imutable. +## +## The library supports generating and inspecting code objects that +## contain machine code for a certain set of instruction set +## arhitectures (isa). The set of isa supported and information about +## the properties of the isa can be queried. +## +## The library supports performing an action that can take data +## objects of one kind, and generate new data objects of another kind. +## +## Data objects are referenced using handles using @p +## amd_comgr_data_t. The kinds of data objects are given +## by @p amd_comgr_data_kind_t. +## +## To perform an action, two @p amd_comgr_data_set_t +## objects are created. One is used to hold all the data objects +## needed by an action, and other is updated by the action with all +## the result data objects. In addition, an @p +## amd_comgr_action_info_t is created to hold +## information that controls the action. These are then passed to @p +## amd_comgr_do_action to perform an action specified by +## @p amd_comgr_action_kind_t. +## +## Some data objects can have associated metadata. There are +## operations for querying this metadata. +## +## The default device library that satisfies the requirements of the +## compiler action can be obtained. +## +## The library inspects some environment variables to aid in debugging. These +## include: +## - @p AMD_COMGR_SAVE_TEMPS: If this is set, and is not "0", the library does +## not delete temporary files generated while executing compilation actions. 
+## These files do not appear in the current working directory, but are +## instead left in a platform-specific temporary directory (/tmp on Linux and +## C:\Temp or the path found in the TEMP environment variable on Windows). +## - @p AMD_COMGR_REDIRECT_LOGS: If this is not set, or is set to "0", logs are +## returned to the caller as normal. If this is set to "stdout"/"-" or +## "stderr", logs are instead redirected to the standard output or error +## stream, respectively. If this is set to any other value, it is interpreted +## as a filename which logs should be appended to. Logs may be redirected +## irrespective of whether logging is enabled. +## - @p AMD_COMGR_EMIT_VERBOSE_LOGS: If this is set, and is not "0", logs will +## include additional Comgr-specific informational messages. +## +## +## @brief Status codes. + +type + ComgrStatus* {.size: sizeof(cint).} = enum + # From amd_comgr_status_t + AMD_COMGR_STATUS_SUCCESS = 0x0, ## The function has been executed successfully. + AMD_COMGR_STATUS_ERROR = 0x1, ## A generic error has occurred. + AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT = 0x2, + ## One of the actual arguments does not meet a precondition stated + ## in the documentation of the corresponding formal argument. This + ## includes both invalid Action types, and invalid arguments to + ## valid Action types. + AMD_COMGR_STATUS_ERROR_OUT_OF_RESOURCES = 0x3 ## Failed to allocate the necessary resources. + +type + ComgrDataKind* {.size: sizeof(cint).} = enum + # From amd_comgr_data_kind_t + AMD_COMGR_DATA_KIND_UNDEF = 0x0, ## No data is available. + AMD_COMGR_DATA_KIND_SOURCE = 0x1, ## The data is a textual main source. + AMD_COMGR_DATA_KIND_INCLUDE = 0x2, + ## The data is a textual source that is included in the main source + ## or other include source. + AMD_COMGR_DATA_KIND_PRECOMPILED_HEADER = 0x3, + ## The data is a precompiled-header source that is included in the main + ## source or other include source. 
+ AMD_COMGR_DATA_KIND_DIAGNOSTIC = 0x4, ## The data is a diagnostic output. + AMD_COMGR_DATA_KIND_LOG = 0x5, ## The data is a textual log output. + AMD_COMGR_DATA_KIND_BC = 0x6, ## The data is compiler LLVM IR bit code for a specific isa. + AMD_COMGR_DATA_KIND_RELOCATABLE = 0x7, ## The data is a relocatable machine code object for a specific isa. + AMD_COMGR_DATA_KIND_EXECUTABLE = 0x8, + ## The data is an executable machine code object for a specific + ## isa. An executable is the kind of code object that can be loaded + ## and executed. + AMD_COMGR_DATA_KIND_BYTES = 0x9, ## The data is a block of bytes. + AMD_COMGR_DATA_KIND_FATBIN = 0x10, ## The data is a fat binary (clang-offload-bundler output). + AMD_COMGR_DATA_KIND_AR = 0x11, ## The data is an archive. + AMD_COMGR_DATA_KIND_BC_BUNDLE = 0x12, ## The data is a bundled bitcode. + AMD_COMGR_DATA_KIND_AR_BUNDLE = 0x13 ## The data is a bundled archive. + + + +type + ComGrActionKind* {.size: sizeof(cint).} = enum + ## + ## @brief The kinds of actions that can be performed. + ## + # From amd_comgr_action_kind_t + AMD_COMGR_ACTION_SOURCE_TO_PREPROCESSOR = 0x0, + ## Preprocess each source data object in @p input in order. For each + ## successful preprocessor invocation, add a source data object to @p result. + ## Resolve any include source names using the names of include data objects + ## in @p input. Resolve any include relative path names using the working + ## directory path in @p info. Preprocess the source for the language in @p + ## info. + ## + ## Return @p AMD_COMGR_STATUS_ERROR if any preprocessing fails. + ## + ## Return @p AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT + ## if isa name or language is not set in @p info. + AMD_COMGR_ACTION_ADD_PRECOMPILED_HEADERS = 0x1, + ## Copy all existing data objects in @p input to @p output, then add the + ## device-specific and language-specific precompiled headers required for + ## compilation. 
+ ## + ## Currently the only supported languages are @p AMD_COMGR_LANGUAGE_OPENCL_1_2 + ## and @p AMD_COMGR_LANGUAGE_OPENCL_2_0. + ## + ## Return @p + ## AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT if isa name or language + ## is not set in @p info, or the language is not supported. + AMD_COMGR_ACTION_COMPILE_SOURCE_TO_BC = 0x2, + ## Compile each source data object in @p input in order. For each + ## successful compilation add a bc data object to @p result. Resolve + ## any include source names using the names of include data objects + ## in @p input. Resolve any include relative path names using the + ## working directory path in @p info. Produce bc for isa name in @p + ## info. Compile the source for the language in @p info. + ## + ## Return @p AMD_COMGR_STATUS_ERROR if any compilation + ## fails. + ## + ## Return @p + ## AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT + ## if isa name or language is not set in @p info. + AMD_COMGR_ACTION_ADD_DEVICE_LIBRARIES = 0x3, + ## Copy all existing data objects in @p input to @p output, then add the + ## device-specific and language-specific bitcode libraries required for + ## compilation. + ## + ## Currently the only supported languages are @p AMD_COMGR_LANGUAGE_OPENCL_1_2, + ## @p AMD_COMGR_LANGUAGE_OPENCL_2_0, and @p AMD_COMGR_LANGUAGE_HIP. + ## + ## The options in @p info should be set to a set of language-specific flags. 
+ ## For OpenCL and HIP these include: + ## + ## correctly_rounded_sqrt + ## daz_opt + ## finite_only + ## unsafe_math + ## wavefrontsize64 + ## + ## For example, to enable daz_opt and unsafe_math, the options should be set + ## as: + ## + ## const char *options[] = {"daz_opt, "unsafe_math"}; + ## size_t optionsCount = sizeof(options) / sizeof(options[0]); + ## + ## amd_comgr_action_info_set_option_list(info, options, optionsCount); + ## + ## Return @p + ## AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT if isa name or language + ## is not set in @p info, the language is not supported, an unknown + ## language-specific flag is supplied, or a language-specific flag is + ## repeated. + ## + ## @deprecated since 1.7 + ## @warning This action, followed by @c AMD_COMGR_ACTION_LINK_BC_TO_BC, may + ## result in subtle bugs due to incorrect linking of the device libraries. + ## The @c + ## AMD_COMGR_ACTION_COMPILE_SOURCE_WITH_DEVICE_LIBS_TO_BC action can + ## be used as a workaround which ensures the link occurs correctly. + AMD_COMGR_ACTION_LINK_BC_TO_BC = 0x4, ## + ## Link a collection of bitcodes, bundled bitcodes, and bundled bitcode + ## archives in @p into a single composite (unbundled) bitcode @p. + ## Any device library bc data object must be explicitly added to @p input if + ## needed. + ## + ## Return @p AMD_COMGR_STATUS_ERROR if the link or unbundling fails. + ## + ## Return @p + ## AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT + ## if isa name is not set in @p info and does not match the isa name + ## of all bc data objects in @p input. + AMD_COMGR_ACTION_OPTIMIZE_BC_TO_BC = 0x5, + ## Optimize each bc data object in @p input and create an optimized bc data + ## object to @p result. + ## + ## Return @p AMD_COMGR_STATUS_ERROR if the optimization fails. + ## + ## Return @p AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT + ## if isa name is not set in @p info and does not match the isa name + ## of all bc data objects in @p input. 
+ AMD_COMGR_ACTION_CODEGEN_BC_TO_RELOCATABLE = 0x6, + ## Perform code generation for each bc data object in @p input in + ## order. For each successful code generation add a relocatable data + ## object to @p result. + ## + ## Return @p AMD_COMGR_STATUS_ERROR if any code + ## generation fails. + ## + ## Return @p + ## AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT + ## if isa name is not set in @p info and does not match the isa name + ## of all bc data objects in @p input. + AMD_COMGR_ACTION_CODEGEN_BC_TO_ASSEMBLY = 0x7, + ## Perform code generation for each bc data object in @p input in + ## order. For each successful code generation add an assembly source data + ## object to @p result. + ## + ## Return @p AMD_COMGR_STATUS_ERROR if any code + ## generation fails. + ## + ## Return @p + ## AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT + ## if isa name is not set in @p info and does not match the isa name + ## of all bc data objects in @p input. + AMD_COMGR_ACTION_LINK_RELOCATABLE_TO_RELOCATABLE = 0x8, + ## Link each relocatable data object in @p input together and add + ## the linked relocatable data object to @p result. Any device + ## library relocatable data object must be explicitly added to @p + ## input if needed. + ## + ## Return @p AMD_COMGR_STATUS_ERROR if the link fails. + ## + ## Return @p + ## AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT + ## if isa name is not set in @p info and does not match the isa name + ## of all relocatable data objects in @p input. + AMD_COMGR_ACTION_LINK_RELOCATABLE_TO_EXECUTABLE = 0x9, + ## Link each relocatable data object in @p input together and add + ## the linked executable data object to @p result. Any device + ## library relocatable data object must be explicitly added to @p + ## input if needed. + ## + ## Return @p AMD_COMGR_STATUS_ERROR if the link fails. 
+ ## + ## Return @p + ## AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT + ## if isa name is not set in @p info and does not match the isa name + ## of all relocatable data objects in @p input. + AMD_COMGR_ACTION_ASSEMBLE_SOURCE_TO_RELOCATABLE = 0xA, + ## Assemble each source data object in @p input in order into machine code. + ## For each successful assembly add a relocatable data object to @p result. + ## Resolve any include source names using the names of include data objects in + ## @p input. Resolve any include relative path names using the working + ## directory path in @p info. Produce relocatable for isa name in @p info. + ## + ## Return @p AMD_COMGR_STATUS_ERROR if any assembly fails. + ## + ## Return @p + ## AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT if isa name is not set in + ## @p info. + AMD_COMGR_ACTION_DISASSEMBLE_RELOCATABLE_TO_SOURCE = 0xB, + ## Disassemble each relocatable data object in @p input in + ## order. For each successful disassembly add a source data object to + ## @p result. + ## + ## Return @p AMD_COMGR_STATUS_ERROR if any disassembly + ## fails. + ## + ## Return @p + ## AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT + ## if isa name is not set in @p info and does not match the isa name + ## of all relocatable data objects in @p input. + AMD_COMGR_ACTION_DISASSEMBLE_EXECUTABLE_TO_SOURCE = 0xC, + ## Disassemble each executable data object in @p input in order. For + ## each successful disassembly add a source data object to @p result. + ## + ## Return @p + ## AMD_COMGR_STATUS_ERROR if any disassembly + ## fails. + ## + ## Return @p + ## AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT + ## if isa name is not set in @p info and does not match the isa name + ## of all relocatable data objects in @p input. + AMD_COMGR_ACTION_DISASSEMBLE_BYTES_TO_SOURCE = 0xD, + ## Disassemble each bytes data object in @p input in order. For each + ## successful disassembly add a source data object to @p + ## result. 
Only simple assembly language commands are generate that + ## corresponf to raw bytes are supported, not any directives that + ## control the code object layout, or symbolic branch targets or + ## names. + ## + ## Return @p AMD_COMGR_STATUS_ERROR if any disassembly + ## fails. + ## + ## Return @p + ## AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT + ## if isa name is not set in @p info + AMD_COMGR_ACTION_COMPILE_SOURCE_TO_FATBIN = 0xE, + ## Compile each source data object in @p input in order. For each + ## successful compilation add a fat binary to @p result. Resolve + ## any include source names using the names of include data objects + ## in @p input. Resolve any include relative path names using the + ## working directory path in @p info. Produce fat binary for isa name in @p + ## info. Compile the source for the language in @p info. + ## + ## Return @p AMD_COMGR_STATUS_ERROR if any compilation + ## fails. + ## + ## Return @p + ## AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT + ## if isa name or language is not set in @p info. + ## + ## @deprecated since 2.5 + ## @see in-process compilation via + ## AMD_COMGR_ACTION_COMPILE_SOURCE_TO_BC, etc. + ## instead + + AMD_COMGR_ACTION_COMPILE_SOURCE_WITH_DEVICE_LIBS_TO_BC = 0xF + ## Compile each source data object in @p input in order. For each + ## successful compilation add a bc data object to @p result. Resolve + ## any include source names using the names of include data objects + ## in @p input. Resolve any include relative path names using the + ## working directory path in @p info. Produce bc for isa name in @p + ## info. Compile the source for the language in @p info. Link against + ## the device-specific and language-specific bitcode device libraries + ## required for compilation. + ## + ## Return @p AMD_COMGR_STATUS_ERROR if any compilation + ## fails. + ## + ## Return @p + ## AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT + ## if isa name or language is not set in @p info. 
+ +type + ComgrData* {.bycopy.} = object + ## @brief A handle to a data object. + ## + ## Data objects are used to hold the data which is either an input or + ## output of a code object manager action. + # From amd_comgr_data_t + handle*: uint64 + + ComgrActionInfo* {.bycopy.} = object + ## @brief A handle to an action information object. + ## + ## An action information object holds all the necessary information, + ## excluding the input data objects, required to perform an action. + # From amd_comgr_action_info_t + handle*: uint64 + +type + ComgrDataset* {.bycopy.} = object + ## @brief A handle to an action information object. + ## + ## An action information object holds all the necessary information, + ## excluding the input data objects, required to perform an action. + # From amd_comgr_data_set_t + handle*: uint64 + + +const libPath = "/opt/rocm/lib/" # For now, only support Linux +static: echo "[Constantine] Will search AMD Comgr in $LD_LIBRARY_PATH and " & libPath & "libamd_comgr.so" +const libAmdComgr = "(libamd_comgr.so|" & libPath & "libamd_comgr.so)" + +{.push noconv, importc, dynlib: libAmdComgr.} + +proc amd_comgr_create_data*(kind: ComgrDataKind; data: var ComgrData): ComgrStatus + ## @brief Create a data object that can hold data of a specified kind. + ## + ## Data objects are reference counted and are destroyed when the + ## reference count reaches 0. When a data object is created its + ## reference count is 1, it has 0 bytes of data, it has an empty name, + ## and it has no metadata. + ## + ## @param[in] kind The kind of data the object is intended to hold. + ## + ## @param[out] data A handle to the data object created. Its reference + ## count is set to 1. + ## + ## @retval ::AMD_COMGR_STATUS_SUCCESS The function has + ## been executed successfully. + ## + ## @retval ::AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT @p + ## kind is an invalid data kind, or @p + ## AMD_COMGR_DATA_KIND_UNDEF. @p data is NULL. 
+ ## + ## @retval ::AMD_COMGR_STATUS_ERROR_OUT_OF_RESOURCES + ## Unable to create the data object as out of resources. + +proc amd_comgr_release_data*(data: ComgrData): ComgrStatus + ## @brief Indicate that no longer using a data object handle. + ## + ## The reference count of the associated data object is + ## decremented. If it reaches 0 it is destroyed. + ## + ## @param[in] data The data object to release. + ## + ## @retval ::AMD_COMGR_STATUS_SUCCESS The function has + ## been executed successfully. + ## + ## @retval ::AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT @p + ## data is an invalid data object, or has kind @p + ## AMD_COMGR_DATA_KIND_UNDEF. + ## + ## @retval ::AMD_COMGR_STATUS_ERROR_OUT_OF_RESOURCES + ## Unable to update the data object as out of resources. + +proc amd_comgr_set_data*(data: ComgrData; size: csize_t; bytes: pointer): ComgrStatus + ## @brief Set the data content of a data object to the specified + ## bytes. + ## + ## Any previous value of the data object is overwritten. Any metadata + ## associated with the data object is also replaced which invalidates + ## all metadata handles to the old metadata. + ## + ## @param[in] data The data object to update. + ## + ## @param[in] size The number of bytes in the data specified by @p bytes. + ## + ## @param[in] bytes The bytes to set the data object to. The bytes are + ## copied into the data object and can be freed after the call. + ## + ## @retval ::AMD_COMGR_STATUS_SUCCESS The function has + ## been executed successfully. + ## + ## @retval ::AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT @p + ## data is an invalid data object, or has kind @p + ## AMD_COMGR_DATA_KIND_UNDEF. + ## + ## @retval ::AMD_COMGR_STATUS_ERROR_OUT_OF_RESOURCES + ## Unable to update the data object as out of resources. + +proc amd_comgr_set_data_name*(data: ComgrData; name: cstring): ComgrStatus + ## @brief Set the name associated with a data object. 
+ ##
+ ## When compiling, the full name of an include directive is used to
+ ## reference the contents of the include data object with the same
+ ## name. The name may also be used for other data objects in log and
+ ## diagnostic output.
+ ##
+ ## @param[in] data The data object to update.
+ ##
+ ## @param[in] name A null terminated string that specifies the name to
+ ## use for the data object. If NULL then the name is set to the empty
+ ## string.
+ ##
+ ## @retval ::AMD_COMGR_STATUS_SUCCESS The function has
+ ## been executed successfully.
+ ##
+ ## @retval ::AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT @p
+ ## data is an invalid data object, or has kind @p
+ ## AMD_COMGR_DATA_KIND_UNDEF.
+ ##
+ ## @retval ::AMD_COMGR_STATUS_ERROR_OUT_OF_RESOURCES
+ ## Unable to update the data object as out of resources.
+
+proc amd_comgr_get_data*(data: ComgrData; size: var csize_t; bytes: pointer): ComgrStatus
+ ## @brief Get the data object contents and/or contents size.
+ ##
+ ## @param[in] data The data object to query.
+ ##
+ ## @param[in, out] size On entry, the size of @p bytes. On return, the size of
+ ## the data object contents.
+ ##
+ ## @param[out] bytes If not NULL, then the first @p size bytes of the
+ ## data object contents are copied. If @p bytes is NULL, only @p size is updated
+ ## (useful in order to find the size of buffer required to copy the contents).
+ ##
+ ## @retval ::AMD_COMGR_STATUS_SUCCESS The function has
+ ## been executed successfully.
+ ##
+ ## @retval ::AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT @p
+ ## data is an invalid data object, or has kind @p
+ ## AMD_COMGR_DATA_KIND_UNDEF. @p size is NULL.
+ ##
+ ## @retval ::AMD_COMGR_STATUS_ERROR_OUT_OF_RESOURCES
+ ## Unable to update the data object as out of resources.
+
+proc amd_comgr_create_action_info*(action_info: var ComgrActionInfo): ComgrStatus
+ ## @brief Create an action info object.
+ ##
+ ## @param[out] action_info A handle to the action info object created.
+ ## + ## @retval ::AMD_COMGR_STATUS_SUCCESS The function has + ## been executed successfully. + ## + ## @retval ::AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT @p + ## action_info is NULL. + ## + ## @retval ::AMD_COMGR_STATUS_ERROR_OUT_OF_RESOURCES + ## Unable to create the action info object as out of resources. + +proc amd_comgr_destroy_action_info*(action_info: ComgrActionInfo): ComgrStatus + ## @brief Destroy an action info object. + ## + ## @param[in] action_info A handle to the action info object to destroy. + ## + ## @retval ::AMD_COMGR_STATUS_SUCCESS The function has + ## been executed successfully. + ## + ## @retval ::AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT @p + ## action_info is an invalid action info object. + ## + ## @retval ::AMD_COMGR_STATUS_ERROR_OUT_OF_RESOURCES + ## Unable to update action info object as out of resources. + +proc amd_comgr_action_info_set_isa_name*(action_info: ComgrActionInfo; + isa_name: cstring): ComgrStatus + + ## @brief Set the isa name of an action info object. + ## + ## When an action info object is created it has no isa name. Some + ## actions require that the action info object has an isa name + ## defined. + ## + ## @param[in] action_info A handle to the action info object to be + ## updated. + ## + ## @param[in] isa_name A null terminated string that is the isa name. If NULL + ## or the empty string then the isa name is cleared. The isa name is defined as + ## the Code Object Target Identification string, described at + ## https://llvm.org/docs/AMDGPUUsage.html#code-object-target-identification + ## + ## @retval ::AMD_COMGR_STATUS_SUCCESS The function has + ## been executed successfully. + ## + ## @retval ::AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT @p + ## action_info is an invalid action info object. @p isa_name is not an + ## isa name supported by this version of the code object manager + ## library. + ## + ## @retval ::AMD_COMGR_STATUS_ERROR_OUT_OF_RESOURCES + ## Unable to update action info object as out of resources. 
+ ## + ## ----- + ## Example ISA: "amdgcn-amd-amdhsa--gfx900" + +proc amd_comgr_create_data_set*(data_set: var ComgrDataset): ComgrStatus + ## @brief Create a data set object. + ## + ## @param[out] data_set A handle to the data set created. Initially it + ## contains no data objects. + ## + ## @retval ::AMD_COMGR_STATUS_SUCCESS The function has been executed + ## successfully. + ## + ## @retval ::AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT @p data_set is NULL. + ## + ## @retval ::AMD_COMGR_STATUS_ERROR_OUT_OF_RESOURCES Unable to create the data + ## set object as out of resources. + +proc amd_comgr_destroy_data_set*(data_set: ComgrDataset): ComgrStatus + ## @brief Destroy a data set object. + ## + ## The reference counts of any associated data objects are decremented. Any + ## handles to the data set object become invalid. + ## + ## @param[in] data_set A handle to the data set object to destroy. + ## + ## @retval ::AMD_COMGR_STATUS_SUCCESS The function has been executed + ## successfully. + ## + ## @retval ::AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT @p data_set is an invalid + ## data set object. + ## + ## @retval ::AMD_COMGR_STATUS_ERROR_OUT_OF_RESOURCES Unable to update data set + ## object as out of resources. + +proc amd_comgr_data_set_add*(data_set: ComgrDataset; data: ComgrData): ComgrStatus + ## @brief Add a data object to a data set object if it is not already added. + ## + ## The reference count of the data object is incremented. + ## + ## @param[in] data_set A handle to the data set object to be updated. + ## + ## @param[in] data A handle to the data object to be added. If @p data_set + ## already has the specified handle present, then it is not added. The order + ## that data objects are added is preserved. + ## + ## @retval ::AMD_COMGR_STATUS_SUCCESS The function has been executed + ## successfully. + ## + ## @retval ::AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT @p data_set is an invalid + ## data set object. 
@p data is an invalid data object; has undef kind; has + ## include kind but does not have a name. + ## + ## @retval ::AMD_COMGR_STATUS_ERROR_OUT_OF_RESOURCES Unable to update data set + ## object as out of resources. + +proc amd_comgr_do_action*(kind: ComGrActionKind; + info: ComgrActionInfo; + input: ComgrDataset; output: ComgrDataset): ComgrStatus + + ## @brief Perform an action. + ## + ## Each action ignores any data objects in @p input that it does not + ## use. If logging is enabled in @info then @p result will have a log + ## data object added. Any diagnostic data objects produced by the + ## action will be added to @p result. See the description of each + ## action in @p amd_comgr_action_kind_t. + ## + ## @param[in] kind The action to perform. + ## + ## @param[in] info The action info to use when performing the action. + ## + ## @param[in] input The input data objects to the @p kind action. + ## + ## @param[out] result Any data objects are removed before performing + ## the action which then adds all data objects produced by the action. + ## + ## @retval ::AMD_COMGR_STATUS_SUCCESS The function has + ## been executed successfully. + ## + ## @retval ::AMD_COMGR_STATUS_ERROR An error was + ## reported when executing the action. + ## + ## @retval ::AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT @p + ## kind is an invalid action kind. @p input_data or @p result_data are + ## invalid action data object handles. See the description of each + ## action in @p amd_comgr_action_kind_t for other + ## conditions that result in this status. + ## + ## @retval ::AMD_COMGR_STATUS_ERROR_OUT_OF_RESOURCES + ## Unable to update the data object as out of resources. + +proc amd_comgr_action_data_get_data*(data_set: ComgrDataset; + data_kind: ComgrDataKind; + index: csize_t; data: var ComgrData): ComgrStatus + ## @brief Return the Nth data object of a specified data kind that is added to a + ## data set object. + ## + ## The reference count of the returned data object is incremented. 
+ ## + ## @param[in] data_set A handle to the data set object to be queried. + ## + ## @param[in] data_kind The data kind of the data object to be returned. + ## + ## @param[in] index The index of the data object of data kind @data_kind to be + ## returned. The first data object is index 0. The order of data objects matches + ## the order that they were added to the data set object. + ## + ## @param[out] data The data object being requested. + ## + ## @retval ::AMD_COMGR_STATUS_SUCCESS The function has been executed + ## successfully. + ## + ## @retval ::AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT @p data_set is an invalid + ## data set object. @p data_kind is an invalid data kind or @p + ## AMD_COMGR_DATA_KIND_UNDEF. @p index is greater than the number of data + ## objects of kind @p data_kind. @p data is NULL. + ## + ## @retval ::AMD_COMGR_STATUS_ERROR_OUT_OF_RESOURCES Unable to query data set + ## object as out of resources. + +{.pop.} # noconv, importc, dynlib: libAmdComgr diff --git a/constantine/platforms/abis/amdgpu_abi.nim b/constantine/platforms/abis/amdgpu_abi.nim new file mode 100644 index 000000000..d2e6d6629 --- /dev/null +++ b/constantine/platforms/abis/amdgpu_abi.nim @@ -0,0 +1,658 @@ +# Constantine +# Copyright (c) 2018-2019 Status Research & Development GmbH +# Copyright (c) 2020-Present Mamy André-Ratsimbazafy +# Licensed and distributed under either of +# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT). +# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). +# at your option. This file may not be copied, modified, or distributed except according to those terms. 
+ +# ############################################################ +# +# Bindings to AMD GPUs libraries +# +# ############################################################ + +import ./c_abi + +# ############################################################ +# +# HIP +# +# ############################################################ + +# Cuda Driver API -> Hip porting guide +# - https://rocm.docs.amd.com/projects/HIP/en/docs-5.7.1/user_guide/hip_porting_driver_api.html +# - https://rocm.docs.amd.com/projects/HIPIFY/en/latest/tables/CUDA_Driver_API_functions_supported_by_HIP.html +# +# c2nim on /opt/rocm/include/hip/hip_runtime_api.h +# or just copy-pasting can be used. + +const libPath = "/opt/rocm/lib/" # For now, only support Linux +static: echo "[Constantine] Will search AMD HIP runtime in $LD_LIBRARY_PATH and " & libPath & "libamdhip64.so" +const libAmdHip = "(libamdhip64.so|" & libPath & "libamdhip64.so)" + +type + HipError* {.size: sizeof(cint).} = enum + ## hipError_t + hipSuccess = 0 ## Successful completion. + hipErrorInvalidValue = 1 ## One or more of the parameters passed to the API call is NULL + ## or not in an acceptable range. + # hipErrorOutOfMemory = 2 ## out of memory range. + ## Deprecated + hipErrorMemoryAllocation = 2 ## Memory allocation error. 
+ # hipErrorNotInitialized = 3 ## Invalid not initialized
+ ## Deprecated
+ hipErrorInitializationError = 3
+ hipErrorDeinitialized = 4 ## Deinitialized
+ hipErrorProfilerDisabled = 5
+ hipErrorProfilerNotInitialized = 6
+ hipErrorProfilerAlreadyStarted = 7
+ hipErrorProfilerAlreadyStopped = 8
+ hipErrorInvalidConfiguration = 9 ## Invalid configuration
+ hipErrorInvalidPitchValue = 12 ## Invalid pitch value
+ hipErrorInvalidSymbol = 13 ## Invalid symbol
+ hipErrorInvalidDevicePointer = 17 ## Invalid Device Pointer
+ hipErrorInvalidMemcpyDirection = 21 ## Invalid memory copy direction
+ hipErrorInsufficientDriver = 35
+ hipErrorMissingConfiguration = 52
+ hipErrorPriorLaunchFailure = 53
+ hipErrorInvalidDeviceFunction = 98 ## Invalid device function
+ hipErrorNoDevice = 100 ## Call to hipGetDeviceCount returned 0 devices
+ hipErrorInvalidDevice = 101 ## DeviceID must be in range from 0 to compute-devices.
+ hipErrorInvalidImage = 200 ## Invalid image
+ hipErrorInvalidContext = 201 ## Produced when input context is invalid.
+ hipErrorContextAlreadyCurrent = 202
+ # hipErrorMapFailed = 205
+ ## Deprecated
+ hipErrorMapBufferObjectFailed = 205 ## Produced when the IPC memory attach failed from ROCr.
+ hipErrorUnmapFailed = 206
+ hipErrorArrayIsMapped = 207
+ hipErrorAlreadyMapped = 208
+ hipErrorNoBinaryForGpu = 209
+ hipErrorAlreadyAcquired = 210
+ hipErrorNotMapped = 211
+ hipErrorNotMappedAsArray = 212
+ hipErrorNotMappedAsPointer = 213
+ hipErrorECCNotCorrectable = 214
+ hipErrorUnsupportedLimit = 215 ## Unsupported limit
+ hipErrorContextAlreadyInUse = 216 ## The context is already in use
+ hipErrorPeerAccessUnsupported = 217
+ hipErrorInvalidKernelFile = 218 ## In CUDA DRV it is CUDA_ERROR_INVALID_PTX
+ hipErrorInvalidGraphicsContext = 219
+ hipErrorInvalidSource = 300 ## Invalid source.
+ hipErrorFileNotFound = 301 ## The file is not found.
+ hipErrorSharedObjectSymbolNotFound = 302
+ hipErrorSharedObjectInitFailed = 303 ## Failed to initialize shared object.
+ hipErrorOperatingSystem = 304 ## Not the correct operating system + # hipErrorInvalidHandle = 400 ## Invalide handle + ## Deprecated + hipErrorInvalidResourceHandle = 400 ## Resource handle (hipEvent_t or hipStream_t) invalid. + hipErrorIllegalState = 401 ## Resource required is not in a valid state to perform operation. + hipErrorNotFound = 500 ## Not found + hipErrorNotReady = 600 ## Indicates that asynchronous operations enqueued earlier are not + ## ready. This is not actually an error but is used to distinguish + ## from hipSuccess (which indicates completion). APIs that return + ## this error include hipEventQuery and hipStreamQuery. + hipErrorIllegalAddress = 700 + hipErrorLaunchOutOfResources = 701 ## Out of resources error. + hipErrorLaunchTimeOut = 702 ## Timeout for the launch. + hipErrorPeerAccessAlreadyEnabled = 704 ## Peer access was already enabled from the current device. + hipErrorPeerAccessNotEnabled = 705 ## Peer access was never enabled from the current device. + hipErrorSetOnActiveProcess = 708 ## The process is active. + hipErrorContextIsDestroyed = 709 ## The context is already destroyed + hipErrorAssert = 710 ## Produced when the kernel calls assert. + hipErrorHostMemoryAlreadyRegistered = 712 ## Produced when trying to lock a page-locked memory. + hipErrorHostMemoryNotRegistered = 713 ## Produced when trying to unlock a non-page-locked memory. + hipErrorLaunchFailure = 719 ## An exception occurred on the device while executing a kernel. + hipErrorCooperativeLaunchTooLarge = + 720 ## This error indicates that the number of blocks launched per grid for a kernel + ## that was launched via cooperative launch APIs exceeds the maximum number of + ## allowed blocks for the current device + hipErrorNotSupported = 801 ## Produced when the hip API is not supported/implemented + hipErrorStreamCaptureUnsupported = 900 ## The operation is not permitted when the stream + ## is capturing. 
+ hipErrorStreamCaptureInvalidated = 901 ## The current capture sequence on the stream
+ ## has been invalidated due to a previous error.
+ hipErrorStreamCaptureMerge = 902 ## The operation would have resulted in a merge of
+ ## two independent capture sequences.
+ hipErrorStreamCaptureUnmatched = 903 ## The capture was not initiated in this stream.
+ hipErrorStreamCaptureUnjoined = 904 ## The capture sequence contains a fork that was not
+ ## joined to the primary stream.
+ hipErrorStreamCaptureIsolation = 905 ## A dependency would have been created which crosses
+ ## the capture sequence boundary. Only implicit
+ ## in-stream ordering dependencies are allowed
+ ## to cross the boundary.
+ hipErrorStreamCaptureImplicit = 906 ## The operation would have resulted in a disallowed
+ ## implicit dependency on a current capture sequence
+ ## from hipStreamLegacy.
+ hipErrorCapturedEvent = 907 ## The operation is not permitted on an event which was last
+ ## recorded in a capturing stream.
+ hipErrorStreamCaptureWrongThread = 908 ## A stream capture sequence not initiated with
+ ## the hipStreamCaptureModeRelaxed argument to
+ ## hipStreamBeginCapture was passed to
+ ## hipStreamEndCapture in a different thread.
+ hipErrorGraphExecUpdateFailure = 910 ## This error indicates that the graph update
+ ## was not performed because it included changes which
+ ## violated constraints specific to instantiated graph
+ ## update.
+ hipErrorUnknown = 999 ## Unknown error.
+
+ ## HSA Runtime Error Codes start here.
+ hipErrorRuntimeMemory = 1052 ## HSA runtime memory call returned error. Typically not seen
+ ## in production systems.
+ hipErrorRuntimeOther = 1053 ## HSA runtime call other than memory returned error. Typically
+ ## not seen in production systems.
+ hipErrorTbd ## Marker that more error codes are needed.
+ + + HipDeviceAttribute* {.size: sizeof(cint).} = enum + ## hipDeviceAttribute_t + + # hipDeviceAttributeCudaCompatibleBegin = 0 + + hipDeviceAttributeCudaCompatibleBegin = 0 ## Whether ECC support is enabled. + hipDeviceAttributeAccessPolicyMaxWindowSize ## Cuda only. The maximum size of the window policy in bytes. + hipDeviceAttributeAsyncEngineCount ## Asynchronous engines number. + hipDeviceAttributeCanMapHostMemory ## Whether host memory can be mapped into device address space + hipDeviceAttributeCanUseHostPointerForRegisteredMem ## Device can access host registered memory + ## at the same virtual address as the CPU + hipDeviceAttributeClockRate ## Peak clock frequency in kilohertz. + hipDeviceAttributeComputeMode ## Compute mode that device is currently in. + hipDeviceAttributeComputePreemptionSupported ## Device supports Compute Preemption. + hipDeviceAttributeConcurrentKernels ## Device can possibly execute multiple kernels concurrently. + hipDeviceAttributeConcurrentManagedAccess ## Device can coherently access managed memory concurrently with the CPU + hipDeviceAttributeCooperativeLaunch ## Support cooperative launch + hipDeviceAttributeCooperativeMultiDeviceLaunch ## Support cooperative launch on multiple devices + hipDeviceAttributeDeviceOverlap ## Device can concurrently copy memory and execute a kernel. + ## Deprecated. Use instead asyncEngineCount. + hipDeviceAttributeDirectManagedMemAccessFromHost ## Host can directly access managed memory on + ## the device without migration + hipDeviceAttributeGlobalL1CacheSupported ## Device supports caching globals in L1 + hipDeviceAttributeHostNativeAtomicSupported ## Link between the device and the host supports native atomic operations + hipDeviceAttributeIntegrated ## Device is integrated GPU + hipDeviceAttributeIsMultiGpuBoard ## Multiple GPU devices. + hipDeviceAttributeKernelExecTimeout ## Run time limit for kernels executed on the device + hipDeviceAttributeL2CacheSize ## Size of L2 cache in bytes. 
0 if the device doesn't have L2 cache. + hipDeviceAttributeLocalL1CacheSupported ## caching locals in L1 is supported + hipDeviceAttributeLuid ## 8-byte locally unique identifier in 8 bytes. Undefined on TCC and non-Windows platforms + hipDeviceAttributeLuidDeviceNodeMask ## Luid device node mask. Undefined on TCC and non-Windows platforms + hipDeviceAttributeComputeCapabilityMajor ## Major compute capability version number. + hipDeviceAttributeManagedMemory ## Device supports allocating managed memory on this system + hipDeviceAttributeMaxBlocksPerMultiProcessor ## Max block size per multiprocessor + hipDeviceAttributeMaxBlockDimX ## Max block size in width. + hipDeviceAttributeMaxBlockDimY ## Max block size in height. + hipDeviceAttributeMaxBlockDimZ ## Max block size in depth. + hipDeviceAttributeMaxGridDimX ## Max grid size in width. + hipDeviceAttributeMaxGridDimY ## Max grid size in height. + hipDeviceAttributeMaxGridDimZ ## Max grid size in depth. + hipDeviceAttributeMaxSurface1D ## Maximum size of 1D surface. + hipDeviceAttributeMaxSurface1DLayered ## Cuda only. Maximum dimensions of 1D layered surface. + hipDeviceAttributeMaxSurface2D ## Maximum dimension (width height) of 2D surface. + hipDeviceAttributeMaxSurface2DLayered ## Cuda only. Maximum dimensions of 2D layered surface. + hipDeviceAttributeMaxSurface3D ## Maximum dimension (width height depth) of 3D surface. + hipDeviceAttributeMaxSurfaceCubemap ## Cuda only. Maximum dimensions of Cubemap surface. + hipDeviceAttributeMaxSurfaceCubemapLayered ## Cuda only. Maximum dimension of Cubemap layered surface. + hipDeviceAttributeMaxTexture1DWidth ## Maximum size of 1D texture. + hipDeviceAttributeMaxTexture1DLayered ## Maximum dimensions of 1D layered texture. + hipDeviceAttributeMaxTexture1DLinear ## Maximum number of elements allocatable in a 1D linear texture. + ## Use cudaDeviceGetTexture1DLinearMaxWidth() instead on Cuda. + hipDeviceAttributeMaxTexture1DMipmap ## Maximum size of 1D mipmapped texture. 
+ hipDeviceAttributeMaxTexture2DWidth ## Maximum dimension width of 2D texture.
+ hipDeviceAttributeMaxTexture2DHeight ## Maximum dimension height of 2D texture.
+ hipDeviceAttributeMaxTexture2DGather ## Maximum dimensions of 2D texture if gather operations performed.
+ hipDeviceAttributeMaxTexture2DLayered ## Maximum dimensions of 2D layered texture.
+ hipDeviceAttributeMaxTexture2DLinear ## Maximum dimensions (width height pitch) of 2D textures bound to pitched memory.
+ hipDeviceAttributeMaxTexture2DMipmap ## Maximum dimensions of 2D mipmapped texture.
+ hipDeviceAttributeMaxTexture3DWidth ## Maximum dimension width of 3D texture.
+ hipDeviceAttributeMaxTexture3DHeight ## Maximum dimension height of 3D texture.
+ hipDeviceAttributeMaxTexture3DDepth ## Maximum dimension depth of 3D texture.
+ hipDeviceAttributeMaxTexture3DAlt ## Maximum dimensions of alternate 3D texture.
+ hipDeviceAttributeMaxTextureCubemap ## Maximum dimensions of Cubemap texture
+ hipDeviceAttributeMaxTextureCubemapLayered ## Maximum dimensions of Cubemap layered texture.
+ hipDeviceAttributeMaxThreadsDim ## Maximum dimension of a block
+ hipDeviceAttributeMaxThreadsPerBlock ## Maximum number of threads per block.
+ hipDeviceAttributeMaxThreadsPerMultiProcessor ## Maximum resident threads per multiprocessor.
+ hipDeviceAttributeMaxPitch ## Maximum pitch in bytes allowed by memory copies
+ hipDeviceAttributeMemoryBusWidth ## Global memory bus width in bits.
+ hipDeviceAttributeMemoryClockRate ## Peak memory clock frequency in kilohertz.
+ hipDeviceAttributeComputeCapabilityMinor ## Minor compute capability version number.
+ hipDeviceAttributeMultiGpuBoardGroupID ## Unique ID of device group on the same multi-GPU board
+ hipDeviceAttributeMultiprocessorCount ## Number of multiprocessors on the device.
+ hipDeviceAttributeUnused1 ## Previously hipDeviceAttributeName
+ hipDeviceAttributePageableMemoryAccess ## Device supports coherently accessing pageable memory
+ ## without calling hipHostRegister on it
+ hipDeviceAttributePageableMemoryAccessUsesHostPageTables ## Device accesses pageable memory via the host's page tables
+ hipDeviceAttributePciBusId ## PCI Bus ID.
+ hipDeviceAttributePciDeviceId ## PCI Device ID.
+ hipDeviceAttributePciDomainID ## PCI Domain ID.
+ hipDeviceAttributePersistingL2CacheMaxSize ## Maximum l2 persisting lines capacity in bytes
+ hipDeviceAttributeMaxRegistersPerBlock ## 32-bit registers available to a thread block. This number is shared
+ ## by all thread blocks simultaneously resident on a multiprocessor.
+ hipDeviceAttributeMaxRegistersPerMultiprocessor ## 32-bit registers available per block.
+ hipDeviceAttributeReservedSharedMemPerBlock ## Shared memory reserved by CUDA driver per block.
+ hipDeviceAttributeMaxSharedMemoryPerBlock ## Maximum shared memory available per block in bytes.
+ hipDeviceAttributeSharedMemPerBlockOptin ## Maximum shared memory per block usable by special opt in.
+ hipDeviceAttributeSharedMemPerMultiprocessor ## Shared memory available per multiprocessor.
+ hipDeviceAttributeSingleToDoublePrecisionPerfRatio ## Cuda only. Performance ratio of single precision to double precision.
+ hipDeviceAttributeStreamPrioritiesSupported ## Whether to support stream priorities.
+ hipDeviceAttributeSurfaceAlignment ## Alignment requirement for surfaces
+ hipDeviceAttributeTccDriver ## Cuda only. Whether device is a Tesla device using TCC driver
+ hipDeviceAttributeTextureAlignment ## Alignment requirement for textures
+ hipDeviceAttributeTexturePitchAlignment ## Pitch alignment requirement for 2D texture references bound to pitched memory;
+ hipDeviceAttributeTotalConstantMemory ## Constant memory size in bytes.
+ hipDeviceAttributeTotalGlobalMem ## Global memory available on device.
+ hipDeviceAttributeUnifiedAddressing ## Cuda only. An unified address space shared with the host. + hipDeviceAttributeUnused2 ## Previously hipDeviceAttributeUuid + hipDeviceAttributeWarpSize ## Warp size in threads. + hipDeviceAttributeMemoryPoolsSupported ## Device supports HIP Stream Ordered Memory Allocator + hipDeviceAttributeVirtualMemoryManagementSupported ## Device supports HIP virtual memory management + hipDeviceAttributeHostRegisterSupported ## Can device support host memory registration via hipHostRegister + hipDeviceAttributeCudaCompatibleEnd = 9999 + + # hipDeviceAttributeAmdSpecificBegin = 10000 + # ---------------------------------------------------------------------------- + + hipDeviceAttributeClockInstructionRate = 10000 ## Frequency in khz of the timer used by the device-side "clock*" + hipDeviceAttributeUnused3 ## Previously hipDeviceAttributeArch + hipDeviceAttributeMaxSharedMemoryPerMultiprocessor ## Maximum Shared Memory PerMultiprocessor. + hipDeviceAttributeUnused4 ## Previously hipDeviceAttributeGcnArch + hipDeviceAttributeUnused5 ## Previously hipDeviceAttributeGcnArchName + hipDeviceAttributeHdpMemFlushCntl ## Address of the HDP_MEM_COHERENCY_FLUSH_CNTL register + hipDeviceAttributeHdpRegFlushCntl ## Address of the HDP_REG_COHERENCY_FLUSH_CNTL register + hipDeviceAttributeCooperativeMultiDeviceUnmatchedFunc ## Supports cooperative launch on multiple + ## devices with unmatched functions + hipDeviceAttributeCooperativeMultiDeviceUnmatchedGridDim ## Supports cooperative launch on multiple + ## devices with unmatched grid dimensions + hipDeviceAttributeCooperativeMultiDeviceUnmatchedBlockDim ## Supports cooperative launch on multiple + ## devices with unmatched block dimensions + hipDeviceAttributeCooperativeMultiDeviceUnmatchedSharedMem ## Supports cooperative launch on multiple + ## devices with unmatched shared memories + hipDeviceAttributeIsLargeBar ## Whether it is LargeBar + hipDeviceAttributeAsicRevision ## Revision of the GPU in 
this device + hipDeviceAttributeCanUseStreamWaitValue ## '1' if Device supports hipStreamWaitValue32() and + ## hipStreamWaitValue64(), '0' otherwise. + hipDeviceAttributeImageSupport ## '1' if Device supports image, '0' otherwise. + hipDeviceAttributePhysicalMultiProcessorCount ## All available physical compute + ## units for the device + hipDeviceAttributeFineGrainSupport ## '1' if Device supports fine grain, '0' otherwise + hipDeviceAttributeWallClockRate ## Constant frequency of wall clock in kilohertz. + + hipDeviceAttributeAmdSpecificEnd = 19999 + hipDeviceAttributeVendorSpecificBegin = 20000 + ## Extended attributes for vendors + + HipMemAttach_flags* = enum + hipMemAttachGlobal = 0x1, ## Memory can be accessed by any stream on any device + hipMemAttachHost = 0x2, ## Memory cannot be accessed by any stream on any device + hipMemAttachSingle = 0x4 + + HipDevice* = distinct int32 ## Hip Compute Device handle + HipContext* = distinct pointer # hipCtx_t + HipModule* = distinct pointer # hipModule_t + HipFunction* = distinct pointer # hipFunction_t + HipStream* = distinct pointer # hipStream_t + HipDeviceptr* = distinct pointer # HipDeviceptr_t + ## A pointer to data on the Hip device + +type + HipDeviceProp* {.bycopy.} = object + # Generated via c2nim from HIP v6.0.2 + # The ABI seems forward compatible with reserved bytes + # + # We don't import the `hip_runtime_api.h` header + # for one less dependency during deployment, + # especially given than some distributions like Ubuntu + # split between a dev (with headers) and regular package + # and Windows path management is cumbersome. + + # Note the macro + # #define hipDeviceProp_t hipDeviceProp_tR0600 + + name*: array[256, char] + ## Device name. + uuid*: hipUUID + ## UUID of a device + luid*: array[8, byte] + ## 8-byte unique identifier. Only valid on windows + luidDeviceNodeMask*: cuint + ## LUID node mask + totalGlobalMem*: csize_t + ## Size of global memory region (in bytes). 
+ sharedMemPerBlock*: csize_t + ## Size of shared memory region (in bytes). + regsPerBlock*: cint + ## Registers per block. + warpSize*: cint + ## Warp size. + memPitch*: csize_t + ## Maximum pitch in bytes allowed by memory copies + ## pitched memory + maxThreadsPerBlock*: cint + ## Max work items per work group or workgroup max size. + maxThreadsDim*: array[3, cint] + ## Max number of threads in each dimension (XYZ) of a block. + maxGridSize*: array[3, cint] + ## Max grid dimensions (XYZ). + clockRate*: cint + ## Max clock frequency of the multiProcessors in khz. + totalConstMem*: csize_t + ## Size of shared memory region (in bytes). + major*: cint + ## Major compute capability. On HCC, this is an approximation and features may + ## differ from CUDA CC. See the arch feature flags for portable ways to query + ## feature caps. + minor*: cint + ## Minor compute capability. On HCC, this is an approximation and features may + ## differ from CUDA CC. See the arch feature flags for portable ways to query + ## feature caps. + textureAlignment*: csize_t + ## Alignment requirement for textures + texturePitchAlignment*: csize_t + ## Pitch alignment requirement for texture references bound to + deviceOverlap*: cint + ## Deprecated. Use asyncEngineCount instead + multiProcessorCount*: cint + ## Number of multi-processors (compute units). + kernelExecTimeoutEnabled*: cint + ## Run time limit for kernels executed on the device + integrated*: cint + ## APU vs dGPU + canMapHostMemory*: cint + ## Check whether HIP can map host memory + computeMode*: cint + ## Compute mode. 
+ maxTexture1D*: cint + ## Maximum number of elements in 1D images + maxTexture1DMipmap*: cint + ## Maximum 1D mipmap texture size + maxTexture1DLinear*: cint + ## Maximum size for 1D textures bound to linear memory + maxTexture2D*: array[2, cint] + ## Maximum dimensions (width, height) of 2D images, in image elements + maxTexture2DMipmap*: array[2, cint] + ## Maximum number of elements in 2D array mipmap of images + maxTexture2DLinear*: array[3, cint] + ## Maximum 2D tex dimensions if tex are bound to pitched memory + maxTexture2DGather*: array[2, cint] + ## Maximum 2D tex dimensions if gather has to be performed + maxTexture3D*: array[3, cint] + ## Maximum dimensions (width, height, depth) of 3D images, in image + ## elements + maxTexture3DAlt*: array[3, cint] + ## Maximum alternate 3D texture dims + maxTextureCubemap*: cint + ## Maximum cubemap texture dims + maxTexture1DLayered*: array[2, cint] + ## Maximum number of elements in 1D array images + maxTexture2DLayered*: array[3, cint] + ## Maximum number of elements in 2D array images + maxTextureCubemapLayered*: array[2, cint] + ## Maximum cubemaps layered texture dims + maxSurface1D*: cint + ## Maximum 1D surface size + maxSurface2D*: array[2, cint] + ## Maximum 2D surface size + maxSurface3D*: array[3, cint] + ## Maximum 3D surface size + maxSurface1DLayered*: array[2, cint] + ## Maximum 1D layered surface size + maxSurface2DLayered*: array[3, cint] + ## Maximum 2D layared surface size + maxSurfaceCubemap*: cint + ## Maximum cubemap surface size + maxSurfaceCubemapLayered*: array[2, cint] + ## Maximum cubemap layered surface size + surfaceAlignment*: csize_t + ## Alignment requirement for surface + concurrentKernels*: cint + ## Device can possibly execute multiple kernels concurrently. + ECCEnabled*: cint + ## Device has ECC support enabled + pciBusID*: cint + ## PCI Bus ID. + pciDeviceID*: cint + ## PCI Device ID. 
+ pciDomainID*: cint + ## PCI Domain ID + tccDriver*: cint + ## 1:If device is Tesla device using TCC driver, else 0 + asyncEngineCount*: cint + ## Number of async engines + unifiedAddressing*: cint + ## Does device and host share unified address space + memoryClockRate*: cint + ## Max global memory clock frequency in khz. + memoryBusWidth*: cint + ## Global memory bus width in bits. + l2CacheSize*: cint + ## L2 cache size. + persistingL2CacheMaxSize*: cint + ## Device's max L2 persisting lines in bytes + maxThreadsPerMultiProcessor*: cint + ## Maximum resident threads per multi-processor. + streamPrioritiesSupported*: cint + ## Device supports stream priority + globalL1CacheSupported*: cint + ## Indicates globals are cached in L1 + localL1CacheSupported*: cint + ## Locals are cahced in L1 + sharedMemPerMultiprocessor*: csize_t + ## Amount of shared memory available per multiprocessor. + regsPerMultiprocessor*: cint + ## registers available per multiprocessor + managedMemory*: cint + ## Device supports allocating managed memory on this system + isMultiGpuBoard*: cint + ## 1 if device is on a multi-GPU board, 0 if not. + multiGpuBoardGroupID*: cint + ## Unique identifier for a group of devices on same multiboard GPU + hostNativeAtomicSupported*: cint + ## Link between host and device supports native atomics + singleToDoublePrecisionPerfRatio*: cint + ## Deprecated. CUDA only. 
+ pageableMemoryAccess*: cint + ## Device supports coherently accessing pageable memory + ## without calling hipHostRegister on it + concurrentManagedAccess*: cint + ## Device can coherently access managed memory concurrently with + ## the CPU + computePreemptionSupported*: cint + ## Is compute preemption supported on the device + canUseHostPointerForRegisteredMem*: cint + ## Device can access host registered memory with same + ## address as the host + cooperativeLaunch*: cint + ## HIP device supports cooperative launch + cooperativeMultiDeviceLaunch*: cint + ## HIP device supports cooperative launch on multiple + ## devices + sharedMemPerBlockOptin*: csize_t + ## Per device m ax shared mem per block usable by special opt in + pageableMemoryAccessUsesHostPageTables*: cint + ## Device accesses pageable memory via the host's + ## page tables + directManagedMemAccessFromHost*: cint + ## Host can directly access managed memory on the device + ## without migration + maxBlocksPerMultiProcessor*: cint + ## Max number of blocks on CU + accessPolicyMaxWindowSize*: cint + ## Max value of access policy window + reservedSharedMemPerBlock*: csize_t + ## Shared memory reserved by driver per block + hostRegisterSupported*: cint + ## Device supports hipHostRegister + sparseHipArraySupported*: cint + ## Indicates if device supports sparse hip arrays + hostRegisterReadOnlySupported*: cint + ## Device supports using the hipHostRegisterReadOnly flag + ## with hipHostRegistger + timelineSemaphoreInteropSupported*: cint + ## Indicates external timeline semaphore support + memoryPoolsSupported*: cint + ## Indicates if device supports hipMallocAsync and hipMemPool APIs + gpuDirectRDMASupported*: cint + ## Indicates device support of RDMA APIs + gpuDirectRDMAFlushWritesOptions*: cuint + ## Bitmask to be interpreted according to + ## hipFlushGPUDirectRDMAWritesOptions + gpuDirectRDMAWritesOrdering*: cint + ## value of hipGPUDirectRDMAWritesOrdering + memoryPoolSupportedHandleTypes*: cuint + 
## Bitmask of handle types support with mempool based IPC + deferredMappingHipArraySupported*: cint + ## Device supports deferred mapping HIP arrays and HIP + ## mipmapped arrays + ipcEventSupported*: cint + ## Device supports IPC events + clusterLaunch*: cint + ## Device supports cluster launch + unifiedFunctionPointers*: cint + ## Indicates device supports unified function pointers + reserved*: array[63, cint] + ## CUDA Reserved. + hipReserved*: array[32, cint] + ## Reserved for adding new entries for HIP/CUDA. + ## HIP Only struct members + gcnArchName*: array[256, char] + ## AMD GCN Arch Name. HIP Only. + maxSharedMemoryPerMultiProcessor*: csize_t + ## Maximum Shared Memory Per CU. HIP Only. + clockInstructionRate*: cint + ## Frequency in khz of the timer used by the device-side "clock*" + ## instructions. New for HIP. + arch*: HipDeviceArch + ## Architectural feature flags. New for HIP. + hdpMemFlushCntl*: ptr cuint + ## Addres of HDP_MEM_COHERENCY_FLUSH_CNTL register + hdpRegFlushCntl*: ptr cuint + ## Addres of HDP_REG_COHERENCY_FLUSH_CNTL register + cooperativeMultiDeviceUnmatchedFunc*: cint + ## HIP device supports cooperative launch on + ## multiple + ## devices with unmatched functions + cooperativeMultiDeviceUnmatchedGridDim*: cint + ## HIP device supports cooperative launch on + ## multiple + ## devices with unmatched grid dimensions + cooperativeMultiDeviceUnmatchedBlockDim*: cint + ## HIP device supports cooperative launch on + ## multiple + ## devices with unmatched block dimensions + cooperativeMultiDeviceUnmatchedSharedMem*: cint + ## HIP device supports cooperative launch on + ## multiple + ## devices with unmatched shared memories + isLargeBar*: cint + ## 1: if it is a large PCI bar device, else 0 + asicRevision*: cint + ## Revision of the GPU in this device + + hipUUID* {.bycopy.} = object + bytes*: array[16, byte] + + HipDeviceArch* {.bycopy.} = object + ## 32-bit Atomics + hasGlobalInt32Atomics* {.bitsize: 1.}: cuint + ## 32-bit integer 
atomics for global memory. + hasGlobalFloatAtomicExch* {.bitsize: 1.}: cuint + ## 32-bit float atomic exch for global memory. + hasSharedInt32Atomics* {.bitsize: 1.}: cuint + ## 32-bit integer atomics for shared memory. + hasSharedFloatAtomicExch* {.bitsize: 1.}: cuint + ## 32-bit float atomic exch for shared memory. + hasFloatAtomicAdd* {.bitsize: 1.}: cuint + ## 32-bit float atomic add in global and shared memory. + ## 64-bit Atomics + hasGlobalInt64Atomics* {.bitsize: 1.}: cuint + ## 64-bit integer atomics for global memory. + hasSharedInt64Atomics* {.bitsize: 1.}: cuint + ## 64-bit integer atomics for shared memory. + ## Doubles + hasDoubles* {.bitsize: 1.}: cuint + ## Double-precision floating point. + ## Warp cross-lane operations + hasWarpVote* {.bitsize: 1.}: cuint + ## Warp vote instructions (__any, __all). + hasWarpBallot* {.bitsize: 1.}: cuint + ## Warp ballot instructions (__ballot). + hasWarpShuffle* {.bitsize: 1.}: cuint + ## Warp shuffle operations. (__shfl_*). + hasFunnelShift* {.bitsize: 1.}: cuint + ## Funnel two words into one with shift&mask caps. + ## Sync + hasThreadFenceSystem* {.bitsize: 1.}: cuint + ## __threadfence_system. + hasSyncThreadsExt* {.bitsize: 1.}: cuint + ## __syncthreads_count, syncthreads_and, syncthreads_or. + ## Misc + hasSurfaceFuncs* {.bitsize: 1.}: cuint + ## Surface functions. + has3dGrid* {.bitsize: 1.}: cuint + ## Grid and group dims are 3D (rather than 2D). + hasDynamicParallelism* {.bitsize: 1.}: cuint + ## Dynamic parallelism. + +proc hipGetDeviceProperties*(prop: var HipDeviceProp, ordinal: int32): HipError {. 
+ noconv, importc: "hipGetDevicePropertiesR0600", dynlib: libAmdHip.} + # Note the macro + # `#define hipGetDeviceProperties hipGetDevicePropertiesR0600` + +{.push noconv, importc, dynlib: libAmdHip.} + +proc hipInit*(flags: uint32): HipError + +proc hipGetDeviceCount*(count: var int32): HipError +proc hipDeviceGet*(device: var HipDevice, ordinal: int32): HipError +proc hipDeviceGetName*(name: ptr char, len: int32, dev: HipDevice): HipError +proc hipDeviceGetAttribute*(r: var int32, attrib: HipDeviceAttribute, dev: HipDevice): HipError + +proc hipCtxCreate*(ctx: var HipContext, flags: uint32, dev: HipDevice): HipError +proc hipCtxDestroy*(ctx: HipContext): HipError +proc hipCtxSynchronize*(ctx: HipContext): HipError + +proc hipModuleLoadData(module: var HipModule, code_object: pointer): HipError {.used.} +proc hipModuleUnload*(module: HipModule): HipError +proc hipModuleGetFunction(kernel: var HipFunction, module: HipModule, fnName: ptr char): HipError {.used.} + +proc hipModuleLaunchKernel( + kernel: HipFunction, + gridDimX, gridDimY, gridDimZ: uint32, + blockDimX, blockDimY, blockDimZ: uint32, + sharedMemBytes: uint32, + stream: HipStream, + kernelParams: ptr pointer, + extra: ptr pointer + ): HipError {.used.} + +proc hipMalloc*(devptr: var HipDeviceptr, size: csize_t): HipError +proc hipMallocManaged*(devptr: var HipDeviceptr, size: csize_t, flags: Flag[HipMemAttach_flags]): HipError +proc hipFree*(devptr: HipDeviceptr): HipError +proc hipMemcpyHtoD*(dst: HipDeviceptr, src: pointer, size: csize_t): HipError +proc hipMemcpyDtoH*(dst: pointer, src: HipDeviceptr, size: csize_t): HipError + +{.pop.} # {.push importc, dynlib: "libamdhip64.so".} + +# ------------------------------------------------------------------------------ +# Sanity check + +when isMainModule: + + template check*(status: HipError) = + ## Check the status code of a Hip operation + ## Exit program with error if failure + + let code = status # ensure that the input expression is evaluated once only 
+ if code != hipSuccess: + writeStackTrace() + stderr.write(astToStr(status) & " " & $instantiationInfo() & " exited with error: " & $code & '\n') + quit 1 + + proc main*() = + var props: HipDeviceProp + var device: cint = 0 + check hipGetDeviceProperties(props, device) + echo "warpSize: ", props.warpSize + echo "GCN Architecture: ", props.gcnArchName + + main() diff --git a/constantine/platforms/llvm/bindings/c_abi.nim b/constantine/platforms/abis/c_abi.nim similarity index 99% rename from constantine/platforms/llvm/bindings/c_abi.nim rename to constantine/platforms/abis/c_abi.nim index 821b84eda..7e16291f1 100644 --- a/constantine/platforms/llvm/bindings/c_abi.nim +++ b/constantine/platforms/abis/c_abi.nim @@ -7,7 +7,7 @@ # at your option. This file may not be copied, modified, or distributed except according to those terms. import std/macros -import ../../ast_rebuilder +import constantine/platforms/ast_rebuilder # ############################################################ # diff --git a/constantine/platforms/llvm/bindings/llvm_abi.nim b/constantine/platforms/abis/llvm_abi.nim similarity index 73% rename from constantine/platforms/llvm/bindings/llvm_abi.nim rename to constantine/platforms/abis/llvm_abi.nim index 981bcb9ac..7fcf342fb 100644 --- a/constantine/platforms/llvm/bindings/llvm_abi.nim +++ b/constantine/platforms/abis/llvm_abi.nim @@ -8,10 +8,7 @@ import ./c_abi -{.passc: gorge("llvm-config --cflags").} -{.passl: gorge("llvm-config --libs").} -const libLLVM = gorge("llvm-config --libnames") - +const libLLVM = "libLLVM-(16|17|18).so" static: echo "[Constantine] Using library " & libLLVM # ############################################################ @@ -133,6 +130,11 @@ proc initializeNVPTXAsmPrinter() {.importc: "LLVMInitializeNVPTXAsmPrinter".} proc initializeNVPTXTarget() {.importc: "LLVMInitializeNVPTXTarget".} proc initializeNVPTXTargetInfo() {.importc: "LLVMInitializeNVPTXTargetInfo".} proc initializeNVPTXTargetMC() {.importc: 
"LLVMInitializeNVPTXTargetMC".} + +proc initializeAMDGPUAsmPrinter() {.importc: "LLVMInitializeAMDGPUAsmPrinter".} +proc initializeAMDGPUTarget() {.importc: "LLVMInitializeAMDGPUTarget".} +proc initializeAMDGPUTargetInfo() {.importc: "LLVMInitializeAMDGPUTargetInfo".} +proc initializeAMDGPUTargetMC() {.importc: "LLVMInitializeAMDGPUTargetMC".} {.pop.} proc getTargetFromName*(name: cstring): TargetRef {.importc: "LLVMGetTargetFromName".} @@ -287,6 +289,254 @@ proc getElementType*(arrayOrVectorTy: TypeRef): TypeRef {.importc: "LLVMGetEleme # Functions # ------------------------------------------------------------ +type + CallingConvention {.size: sizeof(cuint).} = enum + # The default llvm calling convention, compatible with C. This convention + # is the only one that supports varargs calls. As with typical C calling + # conventions, the callee/caller have to tolerate certain amounts of + # prototype mismatch. + C = 0, + + # Generic LLVM calling conventions. None of these support varargs calls, + # and all assume that the caller and callee prototype exactly match. + + # Attempts to make calls as fast as possible (e.g. by passing things in + # registers). + Fast = 8, + + # Attempts to make code in the caller as efficient as possible under the + # assumption that the call is not commonly executed. As such, these calls + # often preserve all registers so that the call does not break any live + # ranges in the caller side. + Cold = 9, + + # Used by the Glasgow Haskell Compiler (GHC). + GHC = 10, + + # Used by the High-Performance Erlang Compiler (HiPE). + HiPE = 11, + + # OBSOLETED - Used for stack based JavaScript calls + # WebKit_JS = 12, + + # Used for dynamic register based calls (e.g. stackmap and patchpoint + # intrinsics). + AnyReg = 13, + + # Used for runtime calls that preserves most registers. + PreserveMost = 14, + + # Used for runtime calls that preserves (almost) all registers. + PreserveAll = 15, + + # Calling convention for Swift. 
+ Swift = 16, + + # Used for access functions. + CXX_FAST_TLS = 17, + + # Attemps to make calls as fast as possible while guaranteeing that tail + # call optimization can always be performed. + Tail = 18, + + # Special calling convention on Windows for calling the Control Guard + # Check ICall funtion. The function takes exactly one argument (address of + # the target function) passed in the first argument register, and has no + # return value. All register values are preserved. + CFGuard_Check = 19, + + # This follows the Swift calling convention in how arguments are passed + # but guarantees tail calls will be made by making the callee clean up + # their stack. + SwiftTail = 20, + + # Used for runtime calls that preserves none general registers. + PreserveNone = 21, + + # This is the start of the target-specific calling conventions, e.g. + # fastcall and thiscall on X86. + # FirstTargetCC = 64, + + # stdcall is mostly used by the Win32 API. It is basically the same as the + # C convention with the difference in that the callee is responsible for + # popping the arguments from the stack. + X86_StdCall = 64, + + # 'fast' analog of X86_StdCall. Passes first two arguments in ECX:EDX + # registers, others - via stack. Callee is responsible for stack cleaning. + X86_FastCall = 65, + + # ARM Procedure Calling Standard (obsolete, but still used on some + # targets). + ARM_APCS = 66, + + # ARM Architecture Procedure Calling Standard calling convention (aka + # EABI). Soft float variant. + ARM_AAPCS = 67, + + # Same as ARM_AAPCS, but uses hard floating point ABI. + ARM_AAPCS_VFP = 68, + + # Used for MSP430 interrupt routines. + MSP430_INTR = 69, + + # Similar to X86_StdCall. Passes first argument in ECX, others via stack. + # Callee is responsible for stack cleaning. MSVC uses this by default for + # methods in its ABI. + X86_ThisCall = 70, + + # Call to a PTX kernel. Passes all arguments in parameter space. + PTX_Kernel = 71, + + # Call to a PTX device function. 
Passes all arguments in register or + # parameter space. + PTX_Device = 72, + + # Used for SPIR non-kernel device functions. No lowering or expansion of + # arguments. Structures are passed as a pointer to a struct with the + # byval attribute. Functions can only call SPIR_FUNC and SPIR_KERNEL + # functions. Functions can only have zero or one return values. Variable + # arguments are not allowed, except for printf. How arguments/return + # values are lowered are not specified. Functions are only visible to the + # devices. + SPIR_FUNC = 75, + + # Used for SPIR kernel functions. Inherits the restrictions of SPIR_FUNC, + # except it cannot have non-void return values, it cannot have variable + # arguments, it can also be called by the host or it is externally + # visible. + SPIR_KERNEL = 76, + + # Used for Intel OpenCL built-ins. + Intel_OCL_BI = 77, + + # The C convention as specified in the x86-64 supplement to the System V + # ABI, used on most non-Windows systems. + X86_64_SysV = 78, + + # The C convention as implemented on Windows/x86-64 and AArch64. It + # differs from the more common \c X86_64_SysV convention in a number of + # ways, most notably in that XMM registers used to pass arguments are + # shadowed by GPRs, and vice versa. On AArch64, this is identical to the + # normal C (AAPCS) calling convention for normal functions, but floats are + # passed in integer registers to variadic functions. + Win64 = 79, + + # MSVC calling convention that passes vectors and vector aggregates in SSE + # registers. + X86_VectorCall = 80, + + # Placeholders for HHVM calling conventions (deprecated, removed). + DUMMY_HHVM = 81, + DUMMY_HHVM_C = 82, + + # x86 hardware interrupt context. Callee may take one or two parameters, + # where the 1st represents a pointer to hardware context frame and the 2nd + # represents hardware error code, the presence of the later depends on the + # interrupt vector taken. Valid for both 32- and 64-bit subtargets. 
+ X86_INTR = 83, + + # Used for AVR interrupt routines. + AVR_INTR = 84, + + # Used for AVR signal routines. + AVR_SIGNAL = 85, + + # Used for special AVR rtlib functions which have an "optimized" + # convention to preserve registers. + AVR_BUILTIN = 86, + + # Used for Mesa vertex shaders, or AMDPAL last shader stage before + # rasterization (vertex shader if tessellation and geometry are not in + # use, or otherwise copy shader if one is needed). + AMDGPU_VS = 87, + + # Used for Mesa/AMDPAL geometry shaders. + AMDGPU_GS = 88, + + # Used for Mesa/AMDPAL pixel shaders. + AMDGPU_PS = 89, + + # Used for Mesa/AMDPAL compute shaders. + AMDGPU_CS = 90, + + # Used for AMDGPU code object kernels. + AMDGPU_KERNEL = 91, + + # Register calling convention used for parameters transfer optimization + X86_RegCall = 92, + + # Used for Mesa/AMDPAL hull shaders (= tessellation control shaders). + AMDGPU_HS = 93, + + # Used for special MSP430 rtlib functions which have an "optimized" + # convention using additional registers. + MSP430_BUILTIN = 94, + + # Used for AMDPAL vertex shader if tessellation is in use. + AMDGPU_LS = 95, + + # Used for AMDPAL shader stage before geometry shader if geometry is in + # use. So either the domain (= tessellation evaluation) shader if + # tessellation is in use, or otherwise the vertex shader. + AMDGPU_ES = 96, + + # Used between AArch64 Advanced SIMD functions + AArch64_VectorCall = 97, + + # Used between AArch64 SVE functions + AArch64_SVE_VectorCall = 98, + + # For emscripten __invoke_* functions. The first argument is required to + # be the function ptr being indirectly called. The remainder matches the + # regular calling convention. + WASM_EmscriptenInvoke = 99, + + # Used for AMD graphics targets. + AMDGPU_Gfx = 100, + + # Used for M68k interrupt routines. + M68k_INTR = 101, + + # Preserve X0-X13, X19-X29, SP, Z0-Z31, P0-P15. + AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0 = 102, + + # Preserve X2-X15, X19-X29, SP, Z0-Z31, P0-P15. 
+ AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2 = 103, + + # Used on AMDGPUs to give the middle-end more control over argument + # placement. + AMDGPU_CS_Chain = 104, + + # Used on AMDGPUs to give the middle-end more control over argument + # placement. Preserves active lane values for input VGPRs. + AMDGPU_CS_ChainPreserve = 105, + + # Used for M68k rtd-based CC (similar to X86's stdcall). + M68k_RTD = 106, + + # Used by GraalVM. Two additional registers are reserved. + GRAAL = 107, + + # Calling convention used in the ARM64EC ABI to implement calls between + # x64 code and thunks. This is basically the x64 calling convention using + # ARM64 register names. The first parameter is mapped to x9. + ARM64EC_Thunk_X64 = 108, + + # Calling convention used in the ARM64EC ABI to implement calls between + # ARM64 code and thunks. This is just the ARM64 calling convention, + # except that the first parameter is mapped to x9. + ARM64EC_Thunk_Native = 109, + + # Calling convention used for RISC-V V-extension. + RISCV_VectorCall = 110, + + # Preserve X1-X15, X19-X29, SP, Z0-Z31, P0-P15. + AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1 = 111, + + # The highest possible ID. Must be some 2^k - 1. 
+ MaxID = 1023 + proc function_t*( returnType: TypeRef, paramTypes: openArray[TypeRef], @@ -299,6 +549,9 @@ proc addFunction*(m: ModuleRef, name: cstring, ty: TypeRef): ValueRef {.importc: proc getReturnType*(functionTy: TypeRef): TypeRef {.importc: "LLVMGetReturnType".} proc countParamTypes*(functionTy: TypeRef): uint32 {.importc: "LLVMCountParamTypes".} +proc getCallingConvention*(function: ValueRef): CallingConvention {.importc: "LLVMGetFunctionCallConv".} +proc setCallingConvention*(function: ValueRef, cc: CallingConvention) {.importc: "LLVMSetFunctionCallConv".} + # ############################################################ # # Values diff --git a/constantine/platforms/llvm/bindings/nvidia_abi.nim b/constantine/platforms/abis/nvidia_abi.nim similarity index 99% rename from constantine/platforms/llvm/bindings/nvidia_abi.nim rename to constantine/platforms/abis/nvidia_abi.nim index ae37ed713..02dd8808f 100644 --- a/constantine/platforms/llvm/bindings/nvidia_abi.nim +++ b/constantine/platforms/abis/nvidia_abi.nim @@ -20,8 +20,9 @@ import ./c_abi # # ############################################################ -static: echo "[Constantine] Using library libcuda.so" -{.passl: "-L/opt/cuda/lib64 -lcuda".} +const libPath = "/opt/cuda/lib64/" # For now, only support Linux +static: echo "[Constantine] Will search Cuda runtime in $LD_LIBRARY_PATH and " & libPath & "libcuda.so" +const libCuda = "(libcuda.so|" & libPath & "libcuda.so)" # Cuda offers 2 APIs: # - cuda.h the driver API @@ -482,7 +483,7 @@ type CUstream* = distinct pointer CUdeviceptr* = distinct pointer -{.push noconv, importc, dynlib: "libcuda.so".} +{.push noconv, importc, dynlib: libCuda.} proc cuInit*(flags: uint32): CUresult diff --git a/constantine/platforms/llvm/llvm.nim b/constantine/platforms/llvm/llvm.nim index c064897bf..d222a306d 100644 --- a/constantine/platforms/llvm/llvm.nim +++ b/constantine/platforms/llvm/llvm.nim @@ -6,7 +6,7 @@ # * Apache v2 license (license terms in the root directory or 
at http://www.apache.org/licenses/LICENSE-2.0). # at your option. This file may not be copied, modified, or distributed except according to those terms. -import ./bindings/llvm_abi {.all.} +import constantine/platforms/abis/llvm_abi {.all.} export llvm_abi # ############################################################ @@ -48,7 +48,7 @@ proc toBitcode*(m: ModuleRef): seq[byte] = copyMem(result[0].addr, mb.getBufferStart(), len) mb.dispose() -template verify*(module: ModuleRef, failureAction: VerifierFailureAction) = +proc verify*(module: ModuleRef, failureAction: VerifierFailureAction) = ## Verify the IR code in a module var errMsg: LLVMstring let err = bool verify(module, failureAction, errMsg) @@ -68,7 +68,7 @@ proc getIdentifier*(module: ModuleRef): string = # Target # ------------------------------------------------------------ -template toTarget*(triple: cstring): TargetRef = +proc toTarget*(triple: cstring): TargetRef = var target: TargetRef var errMsg: LLVMstring let err = bool triple.getTargetFromTriple(target, errMsg) @@ -94,10 +94,16 @@ proc initializeFullNVPTXTarget* {.inline.} = initializeNVPTXTargetMC() initializeNVPTXAsmPrinter() +proc initializeFullAMDGPUTarget* {.inline.} = + initializeAMDGPUTargetInfo() + initializeAMDGPUTarget() + initializeAMDGPUTargetMC() + initializeAMDGPUAsmPrinter() + # Execution Engine # ------------------------------------------------------------ -template createJITCompilerForModule*( +proc createJITCompilerForModule*( engine: var ExecutionEngineRef, module: ModuleRef, optLevel: uint32) = @@ -112,7 +118,7 @@ template createJITCompilerForModule*( # Target Machine # ------------------------------------------------------------ -template emitToFile*(t: TargetMachineRef, m: ModuleRef, +proc emitToFile*(t: TargetMachineRef, m: ModuleRef, fileName: string, codegen: CodeGenFileType) = var errMsg: LLVMstring let err = bool targetMachineEmitToFile(t, m, cstring(fileName), codegen, errMsg) @@ -122,7 +128,7 @@ template emitToFile*(t: 
TargetMachineRef, m: ModuleRef, errMsg.dispose() quit 1 -template emitToString*(t: TargetMachineRef, m: ModuleRef, codegen: CodeGenFileType): string = +proc emitTo*[T: string or seq[byte]](t: TargetMachineRef, m: ModuleRef, codegen: CodeGenFileType): T = ## Codegen to string var errMsg: LLVMstring var mb: MemoryBufferRef @@ -133,10 +139,9 @@ template emitToString*(t: TargetMachineRef, m: ModuleRef, codegen: CodeGenFileTy errMsg.dispose() quit 1 let len = mb.getBufferSize() - var emitted = newString(len) - copyMem(emitted[0].addr, mb.getBufferStart(), len) + result.setLen(len) + copyMem(result[0].addr, mb.getBufferStart(), len) mb.dispose() - emitted # Builder # ------------------------------------------------------------ diff --git a/research/codegen/x86_instr.nim b/research/codegen/x86_instr.nim index 1e6211df1..a4a192194 100644 --- a/research/codegen/x86_instr.nim +++ b/research/codegen/x86_instr.nim @@ -7,7 +7,7 @@ # at your option. This file may not be copied, modified, or distributed except according to those terms. 
import - constantine/platforms/llvm/bindings/c_abi, + constantine/platforms/abis/c_abi, constantine/platforms/llvm/llvm, constantine/platforms/primitives, constantine/math_compiler/ir, diff --git a/research/codegen/x86_poc.nim b/research/codegen/x86_poc.nim index 307826158..c5c376fe4 100644 --- a/research/codegen/x86_poc.nim +++ b/research/codegen/x86_poc.nim @@ -156,7 +156,7 @@ when isMainModule: echo "=========================================" echo "Assembly\n" - echo machine.emitToString(asy.module, AssemblyFile) + echo machine.emitTo[:string](asy.module, AssemblyFile) echo "=========================================" # Output diff --git a/tests/gpu/hello_world_amdgpu.nim b/tests/gpu/hello_world_amdgpu.nim new file mode 100644 index 000000000..c0f57d23c --- /dev/null +++ b/tests/gpu/hello_world_amdgpu.nim @@ -0,0 +1,211 @@ +# Constantine +# Copyright (c) 2018-2019 Status Research & Development GmbH +# Copyright (c) 2020-Present Mamy André-Ratsimbazafy +# Licensed and distributed under either of +# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT). +# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). +# at your option. This file may not be copied, modified, or distributed except according to those terms. 
+ +import + constantine/platforms/llvm/llvm, + constantine/math_compiler/codegen_amdgpu + +echo "AMD GPU JIT compiler Hello World" + +# Docs: +# - https://rocm.docs.amd.com/projects/llvm-project/en/latest/reference/rocmcc.html +# - https://llvm.org/docs/AMDGPUUsage.html + + +proc writeExampleAddMul(ctx: ContextRef, module: ModuleRef, addKernelName, mulKernelName: string) = + + # ###################################### + # Metadata + + const triple = "amdgcn-amd-amdhsa" + + const datalayout1 {.used.} = + "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-" & + "i64:64-" & + "v16:16-v24:32-" & + "v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-" & + "n32:64-S32-A5-G1-ni:7" + + const datalayout2 = + "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-" & + "i64:64-" & + "v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-" & + "n32:64-S32-A5-G1-ni:7:8" + + + # ###################################### + # LLVM IR codegen + + module.setTarget(triple) + module.setDataLayout(datalayout2) + let i128 = ctx.int128_t() + let void_t = ctx.void_t() + + let builder = ctx.createBuilder() + defer: builder.dispose() + + block: + let addType = function_t(void_t, [i128.pointer_t(), i128, i128], isVarArg = LlvmBool(false)) + let addKernel = module.addFunction(addKernelName, addType) + let blck = ctx.appendBasicBlock(addKernel, "addBody") + builder.positionAtEnd(blck) + let r = addKernel.getParam(0) + let a = addKernel.getParam(1) + let b = addKernel.getParam(2) + let sum = builder.add(a, b, "sum") + builder.store(sum, r) + builder.retVoid() + + module.wrapInCallableHipKernel((addType, addKernel)) + + block: + let mulType = function_t(void_t, [i128.pointer_t(), i128, i128], isVarArg = LlvmBool(false)) + let mulKernel = module.addFunction(mulKernelName, mulType) + let blck = ctx.appendBasicBlock(mulKernel, "mulBody") + builder.positionAtEnd(blck) + let r = mulKernel.getParam(0) 
+ let a = mulKernel.getParam(1) + let b = mulKernel.getParam(2) + let prod = builder.mul(a, b, "prod") + builder.store(prod, r) + builder.retVoid() + + module.wrapInCallableHipKernel((mulType, mulKernel)) + + module.verify(AbortProcessAction) + + block: + echo "=================" + echo "LLVM IR output" + echo $module + echo "=================" + +func toHex*(a: uint64): string = + const hexChars = "0123456789abcdef" + const L = 2*sizeof(uint64) + result = newString(L) + var a = a + for j in countdown(result.len-1, 0): + result[j] = hexChars[a and 0xF] + a = a shr 4 + +func toString*(a: openArray[uint64]): string = + result = "0x" + for i in countdown(a.len-1, 0): + result.add toHex(a[i]) + +proc getHipKernel(hipMod: HipModule, fnName: string): HipFunction = + check hipModuleGetFunction(result, hipMod, fnName & "_public") + +proc main() = + + ####################################### + # GPU init + let hipDevice = hipDeviceInit() + + ####################################### + # LLVM IR + let ctx = createContext() + let module = ctx.createModule("test_nnvm") + + let addKernelName = "addKernel" + let mulKernelName = "mulKernel" + + writeExampleAddMul(ctx, module, addKernelName, mulKernelName) + module.verify(AbortProcessAction) + + ####################################### + # Compilation + + initializeFullAMDGPUTarget() + const triple = "amdgcn-amd-amdhsa" + let gcnArchName = getGcnArchName(deviceId = 0) + + let machine = createTargetMachine( + target = toTarget(triple), + triple = triple, + cpu = cstring(gcnArchName), + features = "", + level = CodeGenLevelAggressive, + reloc = RelocDefault, + codeModel = CodeModelDefault + ) + + let objectCode = machine.emitTo[:seq[byte]](module, ObjectFile) + let assembly = machine.emitTo[:string](module, AssemblyFile) + + module.dispose() + ctx.dispose() + + block: + echo "=================" + echo "AMD GCN output" + echo $assembly + echo "=================" + + let exeCode = objectCode.linkAmdGpu(gcnArchName) + + 
####################################### + # GPU JIT + var hipCtx: HipContext + var hipMod: HipModule + check hipCtxCreate(hipCtx, 0, hipDevice) + check hipModuleLoadData(hipMod, exeCode[0].addr) + let addKernel = hipMod.getHipKernel(addKernelName) + let mulKernel = hipMod.getHipKernel(mulKernelName) + + + ####################################### + # Kernel launch + var r{.noInit.}, a, b: array[2, uint64] + + a[1] = 0x00000000000001FF'u64; a[0] = 0xFFFFFFFFFFFFFFFF'u64 + b[1] = 0x0000000000000000'u64; b[0] = 0x0010000000000000'u64 + + echo "r: ", r.toString() + echo "a: ", a.toString() + echo "b: ", b.toString() + + var rGPU: HipDeviceptr + check hipMalloc(rGPU, csize_t sizeof(r)) + + let params = [pointer(rGPU.addr), pointer(a.addr), pointer(b.addr)] + + check hipModuleLaunchKernel( + addKernel, + 1, 1, 1, + 1, 1, 1, + 0, HipStream(nil), + params[0].unsafeAddr, nil) + + check hipMemcpyDtoH(r.addr, rGPU, csize_t sizeof(r)) + echo "a+b: ", r.toString() + + check hipModuleLaunchKernel( + mulKernel, + 1, 1, 1, + 1, 1, 1, + 0, HipStream(nil), + params[0].unsafeAddr, nil) + + check hipMemcpyDtoH(r.addr, rGPU, csize_t sizeof(r)) + echo "a*b: ", r.toString() + + ####################################### + # Cleanup + + check hipFree(rGPU) + rGPU = HipDeviceptr(nil) + + check hipModuleUnload(hipMod) + hipMod = HipModule(nil) + + check hipCtxDestroy(hipCtx) + hipCtx = HipContext(nil) + +main() diff --git a/tests/gpu/hello_world_nvidia.nim b/tests/gpu/hello_world_nvidia.nim index 1f81ce54f..f2a069cfc 100644 --- a/tests/gpu/hello_world_nvidia.nim +++ b/tests/gpu/hello_world_nvidia.nim @@ -6,7 +6,9 @@ # * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). # at your option. This file may not be copied, modified, or distributed except according to those terms. 
-import constantine/platforms/llvm/[llvm, nvidia, bindings/c_abi] +import + constantine/platforms/llvm/llvm, + constantine/math_compiler/codegen_nvidia # ############################################################ # @@ -136,7 +138,7 @@ proc ptxCodegenViaLlvmNvptx(module: ModuleRef, sm: tuple[major, minor: int32]): codeModel = CodeModelDefault ) - machine.emitToString(module, AssemblyFile) + machine.emitTo[:string](module, AssemblyFile) # ############################################################ # @@ -153,7 +155,11 @@ proc writeExampleAddMul(ctx: ContextRef, module: ModuleRef, addKernelName, mulKe const triple = "nvptx64-nvidia-cuda" # Datalayout for NVVM IR 1.8 (CUDA 11.6) - const datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" + const datalayout = + "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-" & + "f32:32:32-f64:64:64-" & + "v16:16:16-v32:32:32-v64:64:64-v128:128:128-" & + "n16:32:64" # ###################################### # LLVM IR codegen @@ -178,7 +184,7 @@ proc writeExampleAddMul(ctx: ContextRef, module: ModuleRef, addKernelName, mulKe builder.store(sum, r) builder.retVoid() - module.setCallableCudaKernel((addType, addKernel)) + module.wrapInCallableCudaKernel((addType, addKernel)) block: let mulType = function_t(void_t, [i128.pointer_t(), i128, i128], isVarArg = LlvmBool(false)) @@ -192,7 +198,7 @@ proc writeExampleAddMul(ctx: ContextRef, module: ModuleRef, addKernelName, mulKe builder.store(prod, r) builder.retVoid() - module.setCallableCudaKernel((mulType, mulKernel)) + module.wrapInCallableCudaKernel((mulType, mulKernel)) module.verify(AbortProcessAction) diff --git a/tests/gpu/t_nvidia_fp.nim b/tests/gpu/t_nvidia_fp.nim index 1384aa92d..b3aa873fe 100644 --- a/tests/gpu/t_nvidia_fp.nim +++ b/tests/gpu/t_nvidia_fp.nim @@ -39,21 +39,21 @@ proc init(T: type CurveMetadata, asy: Assembler_LLVM, curve: static Algebra, 
wor proc genFieldAddPTX(asy: Assembler_LLVM, cm: CurveMetadata) = let fpAdd = asy.field_add_gen(cm, fp) - asy.module.setCallableCudaKernel(fpAdd) + asy.module.wrapInCallableCudaKernel(fpAdd) let frAdd = asy.field_add_gen(cm, fr) - asy.module.setCallableCudaKernel(frAdd) + asy.module.wrapInCallableCudaKernel(frAdd) proc genFieldSubPTX(asy: Assembler_LLVM, cm: CurveMetadata) = let fpSub = asy.field_sub_gen(cm, fp) - asy.module.setCallableCudaKernel(fpSub) + asy.module.wrapInCallableCudaKernel(fpSub) let frSub = asy.field_sub_gen(cm, fr) - asy.module.setCallableCudaKernel(frSub) + asy.module.wrapInCallableCudaKernel(frSub) proc genFieldMulPTX(asy: Assembler_LLVM, cm: CurveMetadata) = let fpMul = asy.field_mul_gen(cm, fp) - asy.module.setCallableCudaKernel(fpMul) + asy.module.wrapInCallableCudaKernel(fpMul) let frMul = asy.field_mul_gen(cm, fr) - asy.module.setCallableCudaKernel(frMul) + asy.module.wrapInCallableCudaKernel(frMul) # Init LLVM # -------------------------