Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Nvidia target via NVRTC and Nim ↦ CUDA DSL #487

Draft
wants to merge 60 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
60 commits
Select commit Hold shift + click to select a range
f8eadcc
commit the initial Nim ⇒ CUDA DSL, NVRTC & CUDA execution helpers
Vindaar Nov 29, 2024
19ac26e
[tests/examples] add example for a BigInt modular addition
Vindaar Nov 29, 2024
462c9e8
remove `nimcuda` dependency, wrap everything we need manually
Vindaar Feb 18, 2025
cb10aaa
merge `execCuda` logic for LLVM & NVRTC
Vindaar Feb 18, 2025
9f9b785
add `quitOnFailure` for the `check` calls to avoid regression
Vindaar Feb 18, 2025
0ae1fbe
[tests] turn big int `modadd` example into real test
Vindaar Feb 18, 2025
dd074f2
copy `libpaths.nim` over from nimcuda
Vindaar Feb 19, 2025
38e592f
[cuda] add partial support for `const` in CUDA generator
Vindaar Feb 20, 2025
6ec730a
[cuda] fix minor issue in `if` statements in CUDA generator
Vindaar Feb 20, 2025
772899a
[cuda] add support for named blocks
Vindaar Feb 20, 2025
9164de5
[cuda] add support for `bool`
Vindaar Feb 20, 2025
ac7489e
[cuda] remove unnecessary semicolon
Vindaar Feb 20, 2025
3db25f1
[cuda] add support for `{.volatile.}` variables
Vindaar Feb 20, 2025
17efeb5
[nvrtc] add `modadd`, `modsub`, `mtymul` implementations using inline…
Vindaar Feb 20, 2025
016d44c
[cuda] support basic type conversions
Vindaar Feb 21, 2025
4a8224a
[cuda] support `var` parameters in procs
Vindaar Feb 21, 2025
14cb2a8
[cuda] make sure proc body is a block
Vindaar Feb 21, 2025
6bede35
[cuda] support boolean / bitwise AND/OR and XOR, NOT
Vindaar Feb 21, 2025
8acae98
[cuda] support int32 literals
Vindaar Feb 21, 2025
538acb6
[cuda] handle prefix `not`
Vindaar Feb 21, 2025
f0328e1
[cuda] make sure to pass `array` types by pointer instead of copy
Vindaar Feb 21, 2025
5ae697b
[nvrtc] add more helpers, add TODO to investigate `slct` calls
Vindaar Feb 21, 2025
609fc46
[nvrtc] add many more field arithmetic / bigint operations
Vindaar Feb 21, 2025
3db60be
[tests] add test to pass by pointer and `var`
Vindaar Feb 21, 2025
96260f0
[tests] add test case for modadd/sub/mtymul
Vindaar Feb 21, 2025
cb525b3
[tests] add basic test cases for all new NVRTC operations
Vindaar Feb 21, 2025
c91904f
[tests] update modadd/sub, mtymul test for new `getFieldModulus`
Vindaar Feb 21, 2025
8599327
add BabyBear field
Vindaar Feb 24, 2025
9022e6c
[nvrtc] handle `mtymul` for fields with 1 limb
Vindaar Feb 24, 2025
aef1a21
[cuda] support nested array types, unpack generic instantiations
Vindaar Feb 24, 2025
a55862f
[cuda] correctly generate ptr to array & ptr to array return types
Vindaar Feb 24, 2025
e4dafb4
[cuda] automatically generate `memcpy` for static array types
Vindaar Feb 25, 2025
6f35b3c
[nvrtc] use `const` for field modulus and other CT constants
Vindaar Feb 26, 2025
49524ff
[cuda] extend error message for non copyable inputs
Vindaar Feb 26, 2025
1cc222a
[cuda] disable passStructByPointer also for CUDA
Vindaar Feb 26, 2025
66f9cda
[cuda] add `{.nimonly.}` pragma one can use in `cuda` block
Vindaar Feb 26, 2025
d509583
[cuda] better logic for detection of type names
Vindaar Feb 26, 2025
3828a80
[cuda] support `const` by mapping it to a `__constant__`
Vindaar Feb 26, 2025
0ab26ae
[cuda] allow type determination from array literal
Vindaar Feb 26, 2025
0f854f7
[nvrtc] get rid of complexity with custom uint32 constants
Vindaar Feb 26, 2025
f54e59e
[staticFor] add stepped variant of `staticFor`
Vindaar Feb 26, 2025
f36f161
[cuda] support func, discard and command nnkCommand
Vindaar Feb 27, 2025
8780afe
[cuda] better handle required semicolons
Vindaar Feb 27, 2025
763f464
[cuda] extract type from `getType` for execution helper
Vindaar Mar 3, 2025
3d59fb9
[cuda] special case `CUdeviceptr` as a type that *must not* be copied
Vindaar Mar 3, 2025
ff6d1f2
[cuda] allow passing in shared memory size for a kernel
Vindaar Mar 3, 2025
3604775
[nvidia ABI] wrap cuModuleGetGlobal, cudaMemcpyKind and a couple more
Vindaar Mar 3, 2025
b907baf
[cuda] support while loops
Vindaar Mar 3, 2025
304373e
[cuda] support void pointers and `nil` literals
Vindaar Mar 3, 2025
b84c27d
[cuda] refactor out module loading from execution
Vindaar Mar 3, 2025
3fe306c
[cuda] store PTX before echoing it
Vindaar Mar 3, 2025
8b4873e
[cuda] add `copyToSymbol` helper to copy to constant symbol in CUDA code
Vindaar Mar 3, 2025
529539e
[cuda] generalize `volatile` annotation to support other pragmas
Vindaar Mar 3, 2025
8dcd58f
[cuda] `cudaName` pragma for custom name for a proc, eg __syncthreads
Vindaar Mar 3, 2025
0a1da24
[cuda] support float literals
Vindaar Mar 3, 2025
f4c71a6
[cuda] map arrays of explicit length 0 to `[]` arrays in CUDA
Vindaar Mar 3, 2025
aabd3bd
[cuda] explicitly support constants, mapped to `__constant__`
Vindaar Mar 3, 2025
eb99ff6
[cuda] minor cleanup
Vindaar Mar 3, 2025
bc5b2f0
[cuda] add `gridDim`, `cuExtern` and `share` + device malloc/free
Vindaar Mar 3, 2025
0900bc0
force compilation with `-d:CTT_32` for the moment
Vindaar Mar 3, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
315 changes: 26 additions & 289 deletions constantine/math_compiler/codegen_nvidia.nim
Original file line number Diff line number Diff line change
Expand Up @@ -71,18 +71,6 @@ export
# Cuda Driver API
# ------------------------------------------------------------

template check*(status: CUresult, quitOnFailure = true) =
  ## Check the status code of a CUDA operation
  ## Exit program with error if failure
  ##
  ## `status` is evaluated exactly once. On failure, the failing source
  ## expression (captured via `astToStr`) and the call site (via
  ## `instantiationInfo`, both resolved at the template's expansion site)
  ## are written to stderr together with a stack trace.

  let code = status # ensure that the input expression is evaluated once only

  if code != CUDA_SUCCESS:
    writeStackTrace()
    stderr.write(astToStr(status) & " " & $instantiationInfo() & " exited with error: " & $code & '\n')
    if quitOnFailure:
      quit 1 # NOTE: this hides exceptions if they are thrown!

func cuModuleLoadData*(module: var CUmodule, sourceCode: openArray[char]): CUresult {.inline.}=
cuModuleLoadData(module, sourceCode[0].unsafeAddr)
func cuModuleGetFunction*(kernel: var CUfunction, module: CUmodule, fnName: openArray[char]): CUresult {.inline.}=
Expand Down Expand Up @@ -221,277 +209,6 @@ proc exec*[T](jitFn: CUfunction, r: var T, a, b: T) =
check cuMemFree(aGPU)
check cuMemFree(bGPU)

proc getTypes(n: NimNode): seq[NimNode] =
  ## Collects the instantiated types of all arguments handed to `execCuda`.
  ## Brackets / tuples / parens are unpacked recursively; anything else must
  ## be a symbol, a literal or an expression of pointer type.
  case n.kind
  of nnkBracket, nnkTupleConstr, nnkPar:
    for child in n:
      result.add getTypes(child)
  of nnkIdent, nnkSym, nnkLiterals:
    result.add getTypeInst(n)
  else:
    if n.typeKind == ntyPtr:
      result.add getTypeInst(n)
    else:
      error("Arguments to `execCuda` must be given as a bracket, tuple or typed expression. Instead: " & $n.treerepr)

proc requiresCopy(n: NimNode): bool =
  ## `true` unless the node's type is a trivial scalar (bool, char, all
  ## integer and float kinds), i.e. `true` for anything whose value must be
  ## copied to the device manually.
  n.typeKind notin {ntyBool, ntyChar, ntyInt .. ntyUint64} # range includes all floats

proc allowsCopy(n: NimNode): bool =
  ## A node may serve as a copy target if it either requires a manual copy
  ## anyway or is a mutable (`var`) symbol.
  if n.requiresCopy:
    true
  else:
    n.symKind == nskVar

proc getIdent(n: NimNode): NimNode =
  ## Produce a `GPU`-suffixed identifier for `n`.
  # Note: the name must be deterministic, because `getIdent` is called for
  # the same symbol from multiple places at the moment.
  if n.kind in {nnkIdent, nnkSym}:
    ident(n.strVal & "GPU")
  else:
    ident("`" & n.repr & "`GPU")

proc determineDevicePtrs(r, i: NimNode, iTypes: seq[NimNode]): seq[(NimNode, NimNode)] =
  ## Pairs each required device-pointer ident with the symbol it mirrors.
  ## Every `res` argument gets a device pointer (and must be writable);
  ## inputs get one only when their type requires an explicit copy.
  for el in r:
    if not el.allowsCopy:
      error("The argument for `res`: " & $el.repr & " of type: " & $el.getTypeImpl().treerepr &
        " does not allow copying. Copying to the address of all result variables is required.")
    result.add (getIdent(el), el)
  for idx, input in i:
    if iTypes[idx].requiresCopy():
      result.add (getIdent(input), input)

proc assembleParams(r, i: NimNode, iTypes: seq[NimNode]): seq[NimNode] =
  ## Builds the full kernel parameter list. `res` arguments always use their
  ## `GPU` device pointer; inputs use it only if they needed a copy,
  ## otherwise the original symbol is passed through directly.
  for el in r: # for `res` we always copy!
    result.add getIdent(el)
  for idx, input in i:
    result.add (if iTypes[idx].requiresCopy(): getIdent(input) else: input)

proc sizeArg(n: NimNode): NimNode =
  ## `sizeof` must measure the data being copied: for a `ptr T` argument we
  ## need the size of `T`, not of the pointer itself.
  if n.typeKind == ntyPtr:
    n.getTypeInst()[0]
  else:
    n

# little helper macro constructors (each emits an AST call node)
template check(arg): untyped = newCall(ident"check", arg)
template size(arg): untyped = newCall(ident"sizeof", sizeArg arg)
template address(arg): untyped = newCall(ident"addr", arg)
template csize_t(arg): untyped = newCall(ident"csize_t", arg)
template pointer(arg): untyped = newCall(ident"pointer", arg)

proc maybeAddress(n: NimNode): NimNode =
  ## Wraps `n` in an `addr` call, *UNLESS* it is already of pointer type.
  if n.typeKind == ntyPtr:
    n
  else:
    address(n)

proc genParams(pId, r, i: NimNode, iTypes: seq[NimNode]): NimNode =
  ## Emits `let <pId> = [pointer(<param>), ...]` — the array of untyped
  ## pointers handed to the kernel launch.
  var bracket = nnkBracket.newTree()
  for p in assembleParams(r, i, iTypes):
    bracket.add pointer(maybeAddress p)
  nnkLetSection.newTree(
    nnkIdentDefs.newTree(pId, newEmptyNode(), bracket)
  )

proc genVar(n: NimNode): (NimNode, NimNode) =
  ## Creates a `let tmp: <typeof n> = n` declaration and returns the fresh
  ## symbol together with the generated let section.
  let tmp = genSym(nskLet, "tmp")
  let section = nnkLetSection.newTree(
    nnkIdentDefs.newTree(tmp, getTypeInst(n), n)
  )
  (tmp, section)

proc genLocalVars(inputs: NimNode): (NimNode, NimNode) =
  ## Literals, constants and the `true`/`false` symbols cannot have their
  ## address taken, so they are materialized as local `let` temporaries.
  ## Returns the declarations and the parameter bracket to use in their stead.
  var decls = newStmtList()            # local variable definitions
  var params = nnkBracket.newTree()    # new bracket of vars for parameters
  for el in inputs:
    let needsLocal =
      el.kind in nnkLiterals or el.kind == nnkConstDef or
      (el.kind == nnkSym and el.strVal in ["true", "false"])
    if needsLocal:
      let (sym, letSec) = genVar(el)
      decls.add letSec
      params.add sym
    else:
      params.add el # keep symbol
  (decls, params)

proc maybeWrap(n: NimNode): NimNode =
  ## Ensures the argument node is a bracket or tuple constructor, so a single
  ## argument behaves like a one-element collection.
  if n.kind in {nnkBracket, nnkTupleConstr}:
    n
  else:
    nnkBracket.newTree(n)

proc endianCheck(): NimNode =
  ## Emits a `static` assertion that the host CPU is little-endian.
  ## The generated parameter passing relies on host and GPU sharing the
  ## same binary representation so a naive memcpy suffices.
  result = quote do:
    static: doAssert cpuEndian == littleEndian, block:
      # From https://developer.nvidia.com/cuda-downloads?target_os=Linux
      # Supported architectures for Cuda are:
      # x86-64, PowerPC 64 little-endian, ARM64 (aarch64)
      # which are all little-endian at word-level.
      #
      # Due to limbs being also stored in little-endian, on little-endian host
      # the CPU and GPU will have the same binary representation
      # whether we use 32-bit or 64-bit words, so naive memcpy can be used for parameter passing.

      "Most CPUs (x86-64, ARM) are little-endian, as are Nvidia GPUs, which allows naive copying of parameters.\n" &
      "Your architecture '" & $hostCPU & "' is big-endian and GPU offloading is unsupported on it."

proc execCudaImpl(jitFn, res, inputs: NimNode): NimNode =
  ## Implementation of `execCuda`. Emits a `block` that:
  ## 1. statically asserts a little-endian host,
  ## 2. allocates a `CUdeviceptr` and copies data to the device for every
  ##    `res` argument and every input whose type requires a copy,
  ## 3. launches the kernel with a 1x1x1 grid and 1x1x1 block,
  ## 4. copies all `res` arguments back from the device,
  ## 5. frees all allocated device memory.
  # Maybe wrap individually given arguments in a `[]` bracket, e.g.
  # `execCuda(res = foo, inputs = bar)`
  let res = maybeWrap res
  let inputs = maybeWrap inputs

  result = newStmtList()
  result.add endianCheck()

  # get the types of the inputs
  # NOTE(review): `rTypes` appears unused below; `getTypes` also validates
  # the argument shape via `error` — confirm it is kept for that side effect.
  let rTypes = getTypes(res)
  let iTypes = getTypes(inputs)

  # determine all required `CUdeviceptr`
  let devPtrs = determineDevicePtrs(res, inputs, iTypes)

  # generate device pointers, allocate memory and copy data
  for x in devPtrs:
    # `var rGPU: CUdeviceptr`
    result.add nnkVarSection.newTree(
      nnkIdentDefs.newTree(
        x[0],
        ident"CUdeviceptr",
        newEmptyNode()
      )
    )

    # `check cuMemAlloc(rGPU, csize_t sizeof(r))`
    result.add(
      check nnkCall.newTree(
        ident"cuMemAlloc",
        x[0],
        csize_t size(x[1])
      )
    )
    # `check cuMemcpyHtoD(aGPU, a.addr, csize_t sizeof(a))`
    # note: `res` arguments are also copied *to* the device, so a result
    # can double as an input
    result.add(
      check nnkCall.newTree(
        ident"cuMemcpyHtoD",
        x[0],
        maybeAddress x[1],
        csize_t size(x[1])
      )
    )

  # Generate local variables for literals/consts that cannot have their
  # address taken
  let (decl, vars) = genLocalVars(inputs)
  result.add decl

  # assemble the parameters (res first, then inputs — order matters for
  # the kernel signature)
  let pId = ident"params"
  let params = genParams(pId, res, vars, iTypes)
  result.add params

  # launch the kernel
  result.add quote do:
    check cuLaunchKernel(
      `jitFn`,
      1, 1, 1, # grid(x, y, z)
      1, 1, 1, # block(x, y, z)
      sharedMemBytes = 0,
      CUstream(nil),
      `pId`[0].unsafeAddr, nil)

  # copy back results (only `res` arguments, hence empty inputs)
  let devPtrsRes = determineDevicePtrs(res, nnkBracket.newTree(), @[])
  for x in devPtrsRes:
    result.add(
      check nnkCall.newTree(
        ident"cuMemcpyDtoH",
        maybeAddress x[1],
        x[0],
        csize_t size(x[1])
      )
    )

  # free memory
  for x in devPtrs:
    result.add(
      check nnkCall.newTree(
        ident"cuMemFree",
        x[0]
      )
    )
  # wrap everything so local declarations don't leak into caller scope
  result = quote do:
    block:
      `result`

macro execCuda*(jitFn: CUfunction,
                res: typed,
                inputs: typed): untyped =
  ## Given a CUDA function, execute the kernel. Copies all non trivial data types to
  ## to the GPU via `cuMemcpyHtoD`. Any argument given as `res` will be copied back
  ## from the GPU after kernel execution finishes.
  ##
  ## IMPORTANT:
  ## The arguments passed to the CUDA kernel will be in the order in which they are
  ## given to the macro. This especially means `res` arguments will be passed first.
  ##
  ## Example:
  ## ```nim
  ## execCuda(fn, res = [r, s], inputs = [a, b, c]) # if all arguments have the same type
  ## # or
  ## execCuda(fn, res = (r, s), inputs = (a, b, c)) # if different types
  ## ```
  ## will pass the parameters as `[r, s, a, b, c]`.
  ##
  ## For more examples see the test case `tests/gpu/t_exec_literals_consts.nim`.
  ##
  ## We do not perform any checks on whether the given types are valid as arguments to
  ## the CUDA target! Also, all arguments given as `res` are expected to be copied.
  ## To return a value for a simple data type, use a `ptr X` type. However, it is allowed
  ## to simply pass a `var` symbol as a `res` argument. We automatically copy to the
  ## the memory location.
  ##
  ## We also copy all `res` data to the GPU, so that a return value can also be used
  ## as an input.
  ##
  ## NOTE: This function is mainly intended for convenient execution of a single kernel
  execCudaImpl(jitFn, res, inputs)

macro execCuda*(jitFn: CUfunction,
                res: typed): untyped =
  ## Overload of the above for the case of no `inputs`; an empty bracket
  ## is substituted so the implementation sees zero input arguments.
  execCudaImpl(jitFn, res, nnkBracket.newTree())

# ############################################################
#
# Compilation helper
Expand All @@ -516,6 +233,12 @@ type

NvidiaAssembler* = ref NvidiaAssemblerObj

## We define a distinct version of the `CUfunction` type to differentiate
## producing a kernel via the LLVM backend from the more direct approach
## using NVRTC. This is because the data passing for field elements
## is more complicated on the LLVM side (requires a manual copy).
CUfunctionLLVM* = distinct CUfunction

proc `=destroy`*(nv: NvidiaAssemblerObj) =
## XXX: Need to also call the finalizer for `asy` in the future!
# NOTE: In the destructor we don't want to quit on a `check` failure.
Expand Down Expand Up @@ -592,7 +315,7 @@ proc initNvAsm*[Name: static Algebra](field: type EC_ShortW_Jac[Fp[Name], G1], w
result.fd = result.cd.fd
result.asy.definePrimitives(result.cd)

proc compile*(nv: NvidiaAssembler, kernName: string): CUfunction =
proc compile*(nv: NvidiaAssembler, kernName: string): CUfunctionLLVM =
## Overload of `compile` below.
## Call this version if you have manually used the Assembler_LLVM object
## to build instructions and have a kernel name you wish to compile.
Expand All @@ -617,18 +340,32 @@ proc compile*(nv: NvidiaAssembler, kernName: string): CUfunction =
check cuModuleLoadData(nv.cuMod, ptx)
# will be cleaned up when `NvidiaAssembler` goes out of scope

result = nv.cuMod.getCudaKernel(kernName)
result = CUfunctionLLVM(nv.cuMod.getCudaKernel(kernName))

proc compile*(nv: NvidiaAssembler, fn: FieldFnGenerator): CUfunctionLLVM =
  ## Given a function that generates code for a finite field operation, compile
  ## that function on the given Nvidia target and return a CUDA function.
  ##
  ## The return type `CUfunctionLLVM` marks the kernel as produced via the
  ## LLVM backend, which affects how arguments are passed to it.
  # execute the `fn` to emit the kernel and obtain its name
  let kernName = nv.asy.fn(nv.fd)
  # `compile(nv, kernName)` already returns `CUfunctionLLVM`; the previous
  # extra `CUfunctionLLVM(...)` conversion was a redundant same-type conversion.
  result = nv.compile(kernName)

proc compile*(nv: NvidiaAssembler, fn: CurveFnGenerator): CUfunctionLLVM =
  ## Given a function that generates code for an elliptic curve operation, compile
  ## that function on the given Nvidia target and return a CUDA function.
  ##
  ## The return type `CUfunctionLLVM` marks the kernel as produced via the
  ## LLVM backend, which affects how arguments are passed to it.
  # execute the `fn` to emit the kernel and obtain its name
  let kernName = nv.asy.fn(nv.cd)
  # `compile(nv, kernName)` already returns `CUfunctionLLVM`; the previous
  # extra `CUfunctionLLVM(...)` conversion was a redundant same-type conversion.
  result = nv.compile(kernName)

import ./experimental/cuda_execute_dsl
macro execCuda*(jitFn: CUfunctionLLVM,
                res: typed,
                inputs: typed): untyped =
  ## See `execCuda` in `constantine/math_compiler/experimental/cuda_execute_dsl.nim`
  ## for an explanation.
  ##
  ## This LLVM overload makes sure we disallow passing simple structs
  ## via their pointer and instead always copy them (required due to our
  ## type definitions for finite field elements and elliptic curve points
  ## on the LLVM target).
  let gridDim = newLit 1
  let blockDim = newLit 1
  result = execCudaImpl(jitFn, gridDim, blockDim, res, inputs,
                        passStructByPointer = false)
Loading
Loading