This repository was archived by the owner on Mar 12, 2021. It is now read-only.

Commit bc03269

Merge pull request #642 from JuliaGPU/tb/mapreduce
Avoid multiple mapreduce kernel launches
2 parents 10d063c + d00716a commit bc03269

1 file changed: +28 -69 lines changed


src/mapreduce.jl

Lines changed: 28 additions & 69 deletions
@@ -80,49 +80,40 @@ end
     return val
 end

-# Partially reduce an array across the grid. The reduction is partial, with multiple
-# blocks `gridDim_reduce` working on reducing data from `A` and writing it to multiple
-# outputs in `R`. All elements to be processed can be addressed by the product of the
-# two iterators `Rreduce` and `Rother`, where the latter iterator will have singleton
-# entries for the dimensions that should be reduced (and vice versa). The output array
-# is expected to have an additional dimension with as size the number of reduced values
-# for every reduction (i.e. more than one if there's multiple blocks participating).
-function partial_mapreduce_grid(f, op, A, R, neutral, Rreduce, Rother, gridDim_reduce, shuffle)
-    # decompose the 1D hardware indices into separate ones for reduction (across threads
-    # and possibly blocks if it doesn't fit) and other elements (remaining blocks)
-    threadIdx_reduce = threadIdx().x
-    blockDim_reduce = blockDim().x
-    blockIdx_other, blockIdx_reduce = fldmod1(blockIdx().x, gridDim_reduce)
-
+# Reduce an array across the grid. All elements to be processed can be addressed by the
+# product of the two iterators `Rreduce` and `Rother`, where the latter iterator will have
+# singleton entries for the dimensions that should be reduced (and vice versa).
+function mapreduce_grid(f, op, A, R, neutral, Rreduce, Rother, shuffle)
     # block-based indexing into the values outside of the reduction dimension
     # (that means we can safely synchronize threads within this block)
-    iother = blockIdx_other
+    iother = blockIdx().x
     @inbounds if iother <= length(Rother)
         Iother = Rother[iother]

         # load the neutral value
-        Iout = CartesianIndex(Tuple(Iother)..., blockIdx_reduce)
+        Iout = Iother
         neutral = if neutral === nothing
             R[Iout]
         else
             neutral
         end

-        # get a value that should be reduced
-        ireduce = threadIdx_reduce + (blockIdx_reduce - 1) * blockDim_reduce
-        val = if ireduce <= length(Rreduce)
+        val = op(neutral, neutral)
+
+        # reduce serially across chunks of input vector that don't fit in a block
+        ireduce = threadIdx().x
+        while ireduce <= length(Rreduce)
             Ireduce = Rreduce[ireduce]
             J = max(Iother, Ireduce)
-            f(A[J])
-        else
-            neutral
+            val = op(val, f(A[J]))
+            ireduce += blockDim().x
         end
-        val = op(val, neutral)

+        # reduce in parallel within the current block
         val = reduce_block(op, val, neutral, shuffle)

         # write back to memory
-        if threadIdx_reduce == 1
+        if threadIdx().x == 1
             R[Iout] = val
         end
     end
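
For readers following along, here is a plain-Julia CPU sketch of the block-stride accumulation pattern the rewritten `mapreduce_grid` kernel uses: each thread starts at its own index and repeatedly jumps ahead by the block size, so a block of any size can serially cover a reduction range of any length before handing its per-thread value to the block-level parallel reduction. The function name `block_stride_sum` and the `nthreads` parameter are invented for the illustration; this is not the GPU kernel itself.

# simulate `nthreads` GPU threads, each accumulating a strided slice of `xs`
function block_stride_sum(xs, nthreads)
    partials = zeros(eltype(xs), nthreads)  # one running value per simulated thread
    for tid in 1:nthreads
        i = tid                             # like `ireduce = threadIdx().x`
        while i <= length(xs)               # like `while ireduce <= length(Rreduce)`
            partials[tid] += xs[i]          # like `val = op(val, f(A[J]))`
            i += nthreads                   # like `ireduce += blockDim().x`
        end
    end
    return sum(partials)                    # stands in for `reduce_block`
end

block_stride_sum(collect(1.0:100.0), 8) == sum(1.0:100.0)  # true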
@@ -142,8 +133,7 @@ NVTX.@range function GPUArrays.mapreducedim!(f, op, R::CuArray{T}, A::AbstractAr
     # be conservative about using shuffle instructions
     shuffle = true
     shuffle &= capability(device()) >= v"3.0"
-    shuffle &= T in (Int32, Int64, Float32, Float64, ComplexF32, ComplexF64)
-    # TODO: add support for Bool (CUDAnative.jl#420)
+    shuffle &= T in (Bool, Int32, Int64, Float32, Float64, ComplexF32, ComplexF64)

     # iteration domain, split in two: one part covers the dimensions that should
     # be reduced, and the other covers the rest. combining both covers all values.
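
As a quick CPU-runnable restatement of the hunk above: shuffle-based reductions are only used for element types supported by the warp-shuffle path, and this commit adds `Bool` to that whitelist. The names `shuffle_eltypes` and `can_shuffle` are invented for the illustration; the real check stays inline in `mapreducedim!` and additionally requires compute capability 3.0 or newer.

shuffle_eltypes = (Bool, Int32, Int64, Float32, Float64, ComplexF32, ComplexF64)
can_shuffle(T) = T in shuffle_eltypes

can_shuffle(Bool)  # true: newly allowed by this commit
can_shuffle(Int8)  # false: such types take the shared-memory path instead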
@@ -154,52 +144,21 @@ NVTX.@range function GPUArrays.mapreducedim!(f, op, R::CuArray{T}, A::AbstractAr
     # CartesianIndices object with UnitRanges that behave badly on the GPU.
     @assert length(Rall) == length(Rother) * length(Rreduce)

-    # allocate an additional, empty dimension to write the reduced value to.
-    # this does not affect the actual location in memory of the final values,
-    # but allows us to write a generalized kernel supporting partial reductions.
-    R′ = reshape(R, (size(R)..., 1))
-
-    # determine how many threads we can launch
-    args = (f, op, A, R′, init, Rreduce, Rother, 1, Val(shuffle))
-    kernel_args = cudaconvert.(args)
-    kernel_tt = Tuple{Core.Typeof.(kernel_args)...}
-    kernel = cufunction(partial_mapreduce_grid, kernel_tt)
-    kernel_config =
-        launch_configuration(kernel.fun; shmem = shuffle ? 0 : threads->2*threads*sizeof(T))
-
-    # determine the launch configuration
-    dev = device()
-    reduce_threads = shuffle ? nextwarp(dev, length(Rreduce)) : nextpow(2, length(Rreduce))
-    if reduce_threads > kernel_config.threads
-        reduce_threads = shuffle ? prevwarp(dev, kernel_config.threads) : prevpow(2, kernel_config.threads)
-    end
-    reduce_blocks = cld(length(Rreduce), reduce_threads)
-    other_blocks = length(Rother)
-    threads, blocks = reduce_threads, reduce_blocks*other_blocks
-    shmem = shuffle ? 0 : 2*threads*sizeof(T)
-
-    # perform the actual reduction
-    if reduce_blocks == 1
-        # we can cover the dimensions to reduce using a single block
-        @cuda threads=threads blocks=blocks shmem=shmem partial_mapreduce_grid(
-            f, op, A, R′, init, Rreduce, Rother, 1, Val(shuffle))
-    else
-        # we need multiple steps to cover all values to reduce
-        partial = similar(R, (size(R)..., reduce_blocks))
-        if init === nothing
-            # without an explicit initializer we need to copy from the output container
-            sz = prod(size(R))
-            for i in 1:reduce_blocks
-                # TODO: async copies (or async fill!, but then we'd need to load first)
-                # or maybe just broadcast since that extends singleton dimensions
-                copyto!(partial, (i-1)*sz+1, R, 1, sz)
-            end
+    function configurator(kernel)
+        config = launch_configuration(kernel.fun)
+        dev = device()
+
+        threads = shuffle ? nextwarp(dev, length(Rreduce)) : nextpow(2, length(Rreduce))
+        if threads > config.threads
+            threads = shuffle ? prevwarp(dev, config.threads) : prevpow(2, config.threads)
         end
-        @cuda threads=threads blocks=blocks shmem=shmem partial_mapreduce_grid(
-            f, op, A, partial, init, Rreduce, Rother, reduce_blocks, Val(shuffle))
+        blocks = length(Rother)
+        shmem = shuffle ? 0 : 2*threads*sizeof(T)

-        GPUArrays.mapreducedim!(identity, op, R′, partial, init)
+        return (threads=threads, blocks=blocks, shmem=shmem)
     end

+    @cuda config=configurator mapreduce_grid(f, op, A, R, init, Rreduce, Rother, Val(shuffle))
+
     return R
 end
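
The host side now performs a single kernel launch and delegates the launch geometry to a callback passed through `@cuda config=...`: the callback receives the compiled kernel, asks the occupancy API how many threads fit, and returns the `threads`/`blocks`/`shmem` values to launch with. Below is a minimal sketch of that pattern, assuming the CUDAnative.jl/CUDAdrv.jl/CuArrays.jl versions of this era; the `vadd` kernel, `N`, and the array names are invented for the example and are not part of this commit.

using CUDAnative, CUDAdrv, CuArrays

# trivial element-wise kernel, used only to demonstrate the launch pattern
function vadd(c, a, b)
    i = threadIdx().x + (blockIdx().x - 1) * blockDim().x
    if i <= length(c)
        @inbounds c[i] = a[i] + b[i]
    end
    return
end

N = 2^20
a = CuArray(rand(Float32, N))
b = CuArray(rand(Float32, N))
c = CuArrays.zeros(Float32, N)

# receives the compiled kernel, returns the launch parameters to use
function vadd_configurator(kernel)
    config = launch_configuration(kernel.fun)  # occupancy-derived upper bound
    threads = min(N, config.threads)
    blocks = cld(N, threads)
    return (threads=threads, blocks=blocks)
end

@cuda config=vadd_configurator vadd(c, a, b)

Compared with the old code, which either launched `partial_mapreduce_grid` once or launched it into a `partial` buffer and then ran a second `mapreducedim!` pass over that buffer, the new code keeps the occupancy query but always completes the reduction in one launch, with each block looping serially over its chunk of `Rreduce`.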
