
Commit c70bb39

Merge pull request #514 from JuliaGPU/tb/docs
Add some docs
2 parents 11eb8ec + 89896bf

File tree

9 files changed: +287 -20 lines

- docs/Manifest.toml
- docs/make.jl
- docs/src/development/troubleshooting.md
- docs/src/faq.md
- docs/src/installation/overview.md
- docs/src/installation/troubleshooting.md
- docs/src/usage/array.md
- lib/cudadrv/error.jl
- src/device/intrinsics/wmma.jl

docs/Manifest.toml

Lines changed: 14 additions & 8 deletions
@@ -18,10 +18,16 @@ uuid = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae"
 version = "0.8.3"

 [[Documenter]]
-deps = ["Base64", "Dates", "DocStringExtensions", "InteractiveUtils", "JSON", "LibGit2", "Logging", "Markdown", "REPL", "Test", "Unicode"]
-git-tree-sha1 = "fb1ff838470573adc15c71ba79f8d31328f035da"
+deps = ["Base64", "Dates", "DocStringExtensions", "IOCapture", "InteractiveUtils", "JSON", "LibGit2", "Logging", "Markdown", "REPL", "Test", "Unicode"]
+git-tree-sha1 = "71e35e069daa9969b8af06cef595a1add76e0a11"
 uuid = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
-version = "0.25.2"
+version = "0.25.3"
+
+[[IOCapture]]
+deps = ["Logging"]
+git-tree-sha1 = "377252859f740c217b936cebcd918a44f9b53b59"
+uuid = "b5f81e59-6552-4d32-b1f0-c071b021bf89"
+version = "0.1.1"

 [[InteractiveUtils]]
 deps = ["Markdown"]
@@ -42,9 +48,9 @@ uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb"

 [[Literate]]
 deps = ["Base64", "JSON", "REPL"]
-git-tree-sha1 = "0ee3b052b944e1a84b6eb0ca15ce3899718df599"
+git-tree-sha1 = "7f289e9db7a93d30b9a44af4a8ae9cf92af74683"
 uuid = "98b081ad-f1c9-55d3-8b20-4c87d4299306"
-version = "2.6.0"
+version = "2.7.0"

 [[Logging]]
 uuid = "56ddb016-857b-54e1-b83d-db4d58db5568"
@@ -57,10 +63,10 @@ uuid = "d6f4376e-aef5-505a-96c1-9c027394607a"
 uuid = "a63ad114-7e13-5084-954f-fe012c677804"

 [[Parsers]]
-deps = ["Dates", "Test"]
-git-tree-sha1 = "8077624b3c450b15c087944363606a6ba12f925e"
+deps = ["Dates"]
+git-tree-sha1 = "6fa4202675c05ba0f8268a6ddf07606350eda3ce"
 uuid = "69de0a69-1ddd-5017-9359-2bf0b02dc9f0"
-version = "1.0.10"
+version = "1.0.11"

 [[Pkg]]
 deps = ["Dates", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "UUIDs"]

docs/make.jl

Lines changed: 1 addition & 0 deletions
@@ -46,6 +46,7 @@ function main()
         ],
         "Development" => Any[
             "development/profiling.md",
+            "development/troubleshooting.md",
         ],
         "API reference" => Any[
             "api/essentials.md",
docs/src/development/troubleshooting.md

Lines changed: 175 additions & 0 deletions

@@ -0,0 +1,175 @@

# Troubleshooting

To increase the logging verbosity of the CUDA.jl compiler, launch Julia with the
`JULIA_DEBUG` environment variable set to `CUDA`.
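For example, a minimal sketch from within Julia, assuming you set the variable before CUDA.jl is loaded (setting `JULIA_DEBUG=CUDA` in the shell environment before starting Julia is equivalent):

```julia
# enable debug-level log messages from the CUDA module
ENV["JULIA_DEBUG"] = "CUDA"

using CUDA  # compiler activity is now logged verbosely
```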
## InvalidIRError: compiling ... resulted in invalid LLVM IR

Not all of Julia is supported by CUDA.jl. Several commonly-used features, like strings or
exceptions, will not compile to GPU code because of their interactions with the CPU-only
runtime library.

For example, say we define and try to execute the following kernel:

```julia
julia> function kernel(a)
         @inbounds a[threadId().x] = 0
         return
       end

julia> @cuda kernel(CuArray([1]))
ERROR: InvalidIRError: compiling kernel kernel(CuDeviceArray{Int64,1,1}) resulted in invalid LLVM IR
Reason: unsupported dynamic function invocation (call to setindex!)
Stacktrace:
 [1] kernel at REPL[2]:2
Reason: unsupported dynamic function invocation (call to getproperty)
Stacktrace:
 [1] kernel at REPL[2]:2
Reason: unsupported use of an undefined name (use of 'threadId')
Stacktrace:
 [1] kernel at REPL[2]:2
```

CUDA.jl does its best to decode the unsupported IR and figure out where it came from. In
this case, there are two so-called dynamic invocations, which happen when a function call
cannot be statically resolved (often because the compiler could not fully infer the call,
e.g., due to inaccurate or unstable type information). Here, these are a red herring, and
the real cause is listed last: a typo in the use of the `threadIdx` function! If we fix
this, the IR error disappears and our kernel successfully compiles and executes, as shown
below.
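For reference, a sketch of the corrected kernel, which compiles and runs cleanly:

```julia
julia> function kernel(a)
           @inbounds a[threadIdx().x] = 0  # threadIdx, not threadId
           return
       end
kernel (generic function with 1 method)

julia> @cuda kernel(CuArray([1]))
```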
## KernelError: kernel returns a value of type `Union{}`

Where the previous section clearly pointed to the source of the invalid IR, in other cases
your function will return an error. This is encoded by the Julia compiler as a return value
of type `Union{}`:

```julia
julia> function kernel(a)
         @inbounds a[threadIdx().x] = CUDA.sin(a[threadIdx().x])
         return
       end

julia> @cuda kernel(CuArray([1]))
ERROR: GPU compilation of kernel kernel(CuDeviceArray{Int64,1,1}) failed
KernelError: kernel returns a value of type `Union{}`
```
Now we don't know where this error came from, and we will have to take a look at the
generated code ourselves. This is easily done using the `@device_code` introspection
macros, which mimic their Base counterparts (e.g. `@device_code_llvm` instead of
`@code_llvm`, etc.).

To debug an error returned by a kernel, we should use `@device_code_warntype` to inspect
the Julia IR. Furthermore, this macro has an `interactive` mode, which further facilitates
inspecting this IR using Cthulhu.jl. First, install and import this package, and then try
to execute the kernel again prefixed by `@device_code_warntype interactive=true`:

```julia
julia> using Cthulhu

julia> @device_code_warntype interactive=true @cuda kernel(CuArray([1]))
Variables
  #self#::Core.Compiler.Const(kernel, false)
  a::CuDeviceArray{Int64,1,1}
  val::Union{}

Body::Union{}
1 ─ %1  = CUDA.sin::Core.Compiler.Const(CUDA.sin, false)
│   ...
│   %14 = (...)::Int64
└──       goto #2
2 ─       (%1)(%14)
└──       $(Expr(:unreachable))

Select a call to descend into or ↩ to ascend.
%17 = call CUDA.sin(::Int64)::Union{}
```
Both from the IR and the list of calls Cthulhu offers to inspect further, we can see that
the call to `CUDA.sin(::Int64)` results in an error: in the IR it is immediately followed
by an `unreachable`, while in the list of calls it is inferred to return `Union{}`. Now
that we know where to look, it's easy to figure out what's wrong:

```julia
help?> CUDA.sin
  # 2 methods for generic function "sin":
  [1] sin(x::Float32) in CUDA at /home/tim/Julia/pkg/CUDA/src/device/intrinsics/math.jl:13
  [2] sin(x::Float64) in CUDA at /home/tim/Julia/pkg/CUDA/src/device/intrinsics/math.jl:12
```

There's no method of `CUDA.sin` that accepts an `Int64`, and thus the function was
determined to unconditionally throw a method error. For now, we disallow these situations
and refuse to compile, but in the spirit of dynamic languages we might change this behavior
to just throw an error at run time.
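One way out, sketched here as our own addition: give the kernel an array whose element type matches one of the available methods, e.g. `Float32`:

```julia
julia> function kernel(a)
           @inbounds a[threadIdx().x] = CUDA.sin(a[threadIdx().x])
           return
       end

julia> @cuda kernel(CuArray([1f0]))  # Float32 elements match CUDA.sin(x::Float32)
```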
## Debug info and line-number information

On Julia debug level 1, which is the default setting if unspecified, CUDA.jl emits line
number information corresponding to `nvcc -lineinfo`. This information does not hurt
performance, and is used by a variety of tools to improve the debugging experience.

To emit actual debug info as `nvcc -G` does, you need to start Julia on debug level 2 by
passing the flag `-g2`. Support for emitting PTX-compatible debug info is a recent addition
to the NVPTX LLVM back-end, so it's possible this information is incorrect or otherwise
affects compilation.

!!! warning

    Due to bugs in LLVM and/or CUDA, the debug info as emitted by LLVM 8.0 or higher
    results in crashes when loading the compiled code. As a result, all types of debug info
    are disabled by CUDA.jl on Julia 1.4 or above. If you need line number information, you
    need to revert to using Julia 1.3, which uses LLVM 6.0 (note that actual debug info is
    not supported by LLVM 6.0).

To disable all debug info emission, start Julia with the flag `-g0`.
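In summary, the debug levels map to the following invocations (a quick reference, assuming a Unix shell):

```
$ julia -g0    # no debug info at all
$ julia        # default (-g1): line-number information, as with nvcc -lineinfo
$ julia -g2    # actual debug info, as with nvcc -G
```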
## Stack trace information

The Julia debug level is also used to determine how much backtrace information to embed in
the module. This information is used when displaying exceptions on the device, e.g., when
going out of bounds:

```julia
julia> function kernel(a)
         a[threadIdx().x] = 0
         return
       end
kernel (generic function with 1 method)

julia> @cuda threads=2 kernel(CuArray([1]))
```

Since we launch two threads over a one-element array, the second thread will index out of
bounds. On the default debug level of 1, a simple error message will be displayed:

```
ERROR: a exception was thrown during kernel execution.
Run Julia on debug level 2 for device stack traces.
```

If we set the debug level to 2, by passing `-g2` to `julia`, we see:

```
ERROR: a exception was thrown during kernel execution.
Stacktrace:
 [1] throw_boundserror at abstractarray.jl:541
 [2] checkbounds at abstractarray.jl:506
 [3] arrayset at /home/tim/Julia/pkg/CUDA/src/device/array.jl:84
 [4] setindex! at /home/tim/Julia/pkg/CUDA/src/device/array.jl:101
 [5] kernel at REPL[4]:2
```

Note that these messages are embedded in the module (CUDA does not support stack
unwinding), and thus bloat its size. To avoid any overhead, you can disable these messages
by setting the debug level to 0 (passing `-g0` to `julia`). This disables any device-side
message, but retains the host-side detection:

```julia
julia> @cuda threads=2 kernel(CuArray([1]))
# no device-side error message!

julia> synchronize()
ERROR: KernelException: exception thrown during kernel execution
```

docs/src/faq.md

Lines changed: 42 additions & 0 deletions
@@ -3,6 +3,48 @@
This page is a compilation of frequently asked questions and answers.


## An old version of CUDA.jl keeps getting installed!

Sometimes it happens that a breaking version of CUDA.jl or one of its dependencies is
released. If any package you use isn't yet compatible with this release, it will block the
automatic upgrade of CUDA.jl. For example, with Flux.jl v0.11.1 we get CUDA.jl v1.3.3
despite there being a v2.x release:

```
pkg> add Flux
  [587475ba] + Flux v0.11.1

pkg> add CUDA
  [052768ef] + CUDA v1.3.3
```

To examine which package is holding back CUDA.jl, you can "force" an upgrade by
specifically requesting a newer version. The resolver will then complain, and explain why
this upgrade isn't possible:

```
pkg> add CUDA@2
   Resolving package versions...
ERROR: Unsatisfiable requirements detected for package Adapt [79e6a3ab]:
 Adapt [79e6a3ab] log:
 ├─possible versions are: [0.3.0-0.3.1, 0.4.0-0.4.2, 1.0.0-1.0.1, 1.1.0, 2.0.0-2.0.2, 2.1.0, 2.2.0, 2.3.0] or uninstalled
 ├─restricted by compatibility requirements with CUDA [052768ef] to versions: [2.2.0, 2.3.0]
 │ └─CUDA [052768ef] log:
 │   ├─possible versions are: [0.1.0, 1.0.0-1.0.2, 1.1.0, 1.2.0-1.2.1, 1.3.0-1.3.3, 2.0.0-2.0.2] or uninstalled
 │   └─restricted to versions 2 by an explicit requirement, leaving only versions 2.0.0-2.0.2
 └─restricted by compatibility requirements with Flux [587475ba] to versions: [0.3.0-0.3.1, 0.4.0-0.4.2, 1.0.0-1.0.1, 1.1.0] — no versions left
   └─Flux [587475ba] log:
     ├─possible versions are: [0.4.1, 0.5.0-0.5.4, 0.6.0-0.6.10, 0.7.0-0.7.3, 0.8.0-0.8.3, 0.9.0, 0.10.0-0.10.4, 0.11.0-0.11.1] or uninstalled
     ├─restricted to versions * by an explicit requirement, leaving only versions [0.4.1, 0.5.0-0.5.4, 0.6.0-0.6.10, 0.7.0-0.7.3, 0.8.0-0.8.3, 0.9.0, 0.10.0-0.10.4, 0.11.0-0.11.1]
     └─restricted by compatibility requirements with CUDA [052768ef] to versions: [0.4.1, 0.5.0-0.5.4, 0.6.0-0.6.10, 0.7.0-0.7.3, 0.8.0-0.8.3, 0.9.0, 0.10.0-0.10.4] or uninstalled, leaving only versions: [0.4.1, 0.5.0-0.5.4, 0.6.0-0.6.10, 0.7.0-0.7.3, 0.8.0-0.8.3, 0.9.0, 0.10.0-0.10.4]
       └─CUDA [052768ef] log: see above
```

A common source of these incompatibilities is having both CUDA.jl and the older
CUDAnative.jl/CuArrays.jl/CUDAdrv.jl stack installed: these are incompatible and cannot
coexist. You can inspect which exact packages you have installed in the Pkg REPL using the
`status --manifest` option, as shown below.
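For example, via the `Pkg` API from the Julia prompt (a sketch; `pkg> status --manifest` in the Pkg REPL is equivalent):

```julia
import Pkg

# list everything in the manifest, including indirect dependencies;
# look for CUDAnative, CuArrays or CUDAdrv entries alongside CUDA
Pkg.status(mode=Pkg.PKGMODE_MANIFEST)
```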
## Can you wrap this or that CUDA API?

If a certain API isn't wrapped with some high-level functionality, you can always use the

docs/src/installation/overview.md

Lines changed: 41 additions & 0 deletions
@@ -8,6 +8,47 @@ using the artifact subsystem.


## Package installation

For most users, installing the latest tagged version of CUDA.jl will be sufficient. You can
easily do that using the package manager:

```
pkg> add CUDA
```

Or, equivalently, via the `Pkg` API:

```julia
julia> import Pkg; Pkg.add("CUDA")
```

In some cases, you might need to use the `master` version of this package, e.g., because it
includes a specific fix you need. Often, however, the development version of this package
itself relies on unreleased versions of other packages. This information is recorded in the
manifest at the root of the repository, which you can use by starting Julia from the
CUDA.jl directory with the `--project` flag:

```
$ cd .julia/dev/CUDA.jl     # or wherever you have CUDA.jl checked out
$ julia --project
pkg> instantiate            # to install correct dependencies
julia> using CUDA
```

In case you want to use the development version of CUDA.jl with other packages, you cannot
use the manifest, and you need to manually install those dependencies from the master
branch. Again, the exact requirements are recorded in CUDA.jl's manifest, but often the
following instructions will work:

```
pkg> add GPUCompiler#master
pkg> add GPUArrays#master
pkg> add LLVM#master
```


## Platform support

All three major operating systems are supported: Linux, Windows and macOS. However, that

docs/src/installation/troubleshooting.md

Lines changed: 8 additions & 0 deletions
@@ -21,12 +21,14 @@ Generally though, it's impossible to say what's the reason for the error, but Julia is
likely not to blame. Make sure your set-up works (e.g., try executing `nvidia-smi`, a CUDA C
binary, etc), and if everything looks good file an issue.


## NVML library not found (on Windows)

Check and make sure the `NVSMI` folder is in your `PATH`. By default it may not be. Look in
`C:\Program Files\NVIDIA Corporation` for the `NVSMI` folder - you should see `nvml.dll`
within it. You can add this folder to your `PATH` and check that `nvidia-smi` runs properly.
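As a hypothetical sketch (the path and the approach are assumptions, not from the original text), you could also make that folder visible to just the current Julia session before loading CUDA.jl:

```julia
# hypothetical: append the default NVSMI location to this session's PATH
# (adjust the path to match your driver installation)
ENV["PATH"] *= ";C:\\Program Files\\NVIDIA Corporation\\NVSMI"

using CUDA  # NVML should now be locatable
```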
## LLVM error: Cannot cast between two non-generic address spaces

You are using an unpatched copy of LLVM, likely caused by using Julia as packaged by your

@@ -37,3 +39,9 @@ extensive list of patches to be applied to the specific versions of LLVM that are
It is thus recommended to use the official binaries, or use a version of Julia built without
setting `USE_SYSTEM_LLVM=1` (which you can suggest to maintainers of your Linux distribution).


## LoadError: UndefVarError: AddrSpacePtr not defined

You are using an old version of CUDA.jl in combination with a recent version of Julia
(1.5+). This is not supported, and you should be using CUDA.jl 1.x or above.

docs/src/usage/array.md

Lines changed: 2 additions & 2 deletions
@@ -326,8 +326,8 @@ julia> y = CUDA.rand(2)
  0.03902049
  0.9689629

-julia> CUBLAS.dot(2, x, 0, y, 0)
-0.057767443f0
+julia> CUBLAS.dot(2, x, y)
+0.92129254f0

 julia> using LinearAlgebra

lib/cudadrv/error.jl

Lines changed: 2 additions & 8 deletions
@@ -26,18 +26,12 @@ Base.:(==)(x::CuError,y::CuError) = x.code == y.code

 Gets the string representation of an error code.

-This name can often be used as a symbol in source code to get an instance of this error.
-For example:
-
 ```jldoctest
-julia> err = CuError(1)
-CuError(1, ERROR_INVALID_VALUE)
+julia> err = CuError(CUDA.cudaError_enum(1))
+CuError(CUDA_ERROR_INVALID_VALUE)

 julia> name(err)
 "ERROR_INVALID_VALUE"
-
-julia> ERROR_INVALID_VALUE
-CuError(1, ERROR_INVALID_VALUE)
 ```
 """
 function name(err::CuError)

src/device/intrinsics/wmma.jl

Lines changed: 2 additions & 2 deletions
@@ -379,8 +379,8 @@ All WMMA operations take a `Config` as their final argument.

 # Examples
 ```jldoctest
-julia> config = Config{16, 16, 16, Float32}
-Config{16,16,16,Float32}
+julia> config = WMMA.Config{16, 16, 16, Float32}
+CUDA.WMMA.Config{16,16,16,Float32}
 ```
 """
 struct Config{M, N, K, d_type} end
