     return val
 end
 
-# Reduce an array across the grid. All elements to be processed can be addressed by the
-# product of the two iterators `Rreduce` and `Rother`, where the latter iterator will have
-# singleton entries for the dimensions that should be reduced (and vice versa).
-function mapreduce_grid(f, op, A, R, neutral, Rreduce, Rother, shuffle)
+# Partially reduce an array across the grid. The reduction is partial, with
+# `gridDim_reduce` blocks working on reducing data from `A` and writing it to multiple
+# outputs in `R`. All elements to be processed can be addressed by the product of the
+# two iterators `Rreduce` and `Rother`, where the latter iterator will have singleton
+# entries for the dimensions that should be reduced (and vice versa). The output array
+# is expected to have an additional dimension whose size is the number of reduced values
+# for every reduction (i.e. more than one if multiple blocks participate).
+function partial_mapreduce_grid(f, op, A, R, neutral, Rreduce, Rother, gridDim_reduce, shuffle)
+    # decompose the 1D hardware indices into separate ones for reduction (across threads
+    # and possibly blocks if it doesn't fit) and other elements (remaining blocks)
+    threadIdx_reduce = threadIdx().x
+    blockDim_reduce = blockDim().x
+    blockIdx_other, blockIdx_reduce = fldmod1(blockIdx().x, gridDim_reduce)
+
     # block-based indexing into the values outside of the reduction dimension
     # (that means we can safely synchronize threads within this block)
-    iother = blockIdx().x
+    iother = blockIdx_other
     @inbounds if iother <= length(Rother)
         Iother = Rother[iother]
 
         # load the neutral value
-        Iout = Iother
+        Iout = CartesianIndex(Tuple(Iother)..., blockIdx_reduce)
         neutral = if neutral === nothing
             R[Iout]
         else
             neutral
         end
-
+
         val = op(neutral, neutral)
 
-        # reduce serially across chunks of input vector that don't fit in a block
-        ireduce = threadIdx().x
+        # get a value that should be reduced
+        ireduce = threadIdx_reduce + (blockIdx_reduce - 1) * blockDim_reduce
         while ireduce <= length(Rreduce)
             Ireduce = Rreduce[ireduce]
             J = max(Iother, Ireduce)
             val = op(val, f(A[J]))
-            ireduce += blockDim().x
+            ireduce += blockDim_reduce * gridDim_reduce
         end
 
-        # reduce in parallel within the current block
         val = reduce_block(op, val, neutral, shuffle)
 
         # write back to memory
-        if threadIdx().x == 1
+        if threadIdx_reduce == 1
            R[Iout] = val
         end
     end
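To make the new indexing scheme concrete: the kernel splits the linear block index with `fldmod1`, so consecutive groups of `gridDim_reduce` blocks cooperate on one output slice, and each thread then walks `Rreduce` with a stride of `blockDim_reduce * gridDim_reduce`. The following host-side sketch (illustrative only, not part of the diff; the launch parameters are hypothetical) replays that loop to show which reduction indices each block ends up visiting:

# replay the kernel's grid-stride loop on the CPU for hypothetical launch parameters
function simulate_partial_reduce(len_reduce, gridDim_reduce, blockDim_reduce)
    coverage = [Int[] for _ in 1:gridDim_reduce, _ in 1:blockDim_reduce]
    for blockIdx_reduce in 1:gridDim_reduce, threadIdx_reduce in 1:blockDim_reduce
        ireduce = threadIdx_reduce + (blockIdx_reduce - 1) * blockDim_reduce
        while ireduce <= len_reduce
            push!(coverage[blockIdx_reduce, threadIdx_reduce], ireduce)
            ireduce += blockDim_reduce * gridDim_reduce
        end
    end
    return coverage
end

# with 10 reduction elements, 2 blocks of 2 threads: block 1 visits 1, 2, 5, 6, 9, 10
# and block 2 visits 3, 4, 7, 8 — together they cover all of 1:10 exactly once
simulate_partial_reduce(10, 2, 2)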
@@ -133,7 +142,8 @@ NVTX.@range function GPUArrays.mapreducedim!(f, op, R::CuArray{T}, A::AbstractAr
     # be conservative about using shuffle instructions
     shuffle = true
     shuffle &= capability(device()) >= v"3.0"
-    shuffle &= T in (Bool, Int32, Int64, Float32, Float64, ComplexF32, ComplexF64)
+    shuffle &= T in (Int32, Int64, Float32, Float64, ComplexF32, ComplexF64)
+    # TODO: add support for Bool (CUDAnative.jl#420)
 
     # iteration domain, split in two: one part covers the dimensions that should
     # be reduced, and the other covers the rest. combining both covers all values.
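As a brief illustration (not from the commit) of that split, assume a 3×4 matrix reduced along its second dimension: `Rreduce` spans the reduced dimension with singletons elsewhere, `Rother` is the opposite, and the elementwise `max` used by the kernel (`J = max(Iother, Ireduce)`) stitches one index from each iterator back into a full index of `A`:

A = rand(3, 4)
Rreduce = CartesianIndices((1:1, 1:4))   # the dimension being reduced
Rother  = CartesianIndices((1:3, 1:1))   # the remaining dimension, singleton where reduced
@assert length(CartesianIndices(A)) == length(Rother) * length(Rreduce)

# combining one index from each iterator addresses every element of A
covered = [max(Iother, Ireduce) for Iother in Rother, Ireduce in Rreduce]
@assert Set(covered) == Set(CartesianIndices(A))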
@@ -144,21 +154,52 @@ NVTX.@range function GPUArrays.mapreducedim!(f, op, R::CuArray{T}, A::AbstractAr
     # CartesianIndices object with UnitRanges that behave badly on the GPU.
     @assert length(Rall) == length(Rother) * length(Rreduce)
 
-    function configurator(kernel)
-        config = launch_configuration(kernel.fun)
-        dev = device()
-
-        threads = shuffle ? nextwarp(dev, length(Rreduce)) : nextpow(2, length(Rreduce))
-        if threads > config.threads
-            threads = shuffle ? prevwarp(dev, config.threads) : prevpow(2, config.threads)
+    # allocate an additional, empty dimension to write the reduced value to.
+    # this does not affect the actual location in memory of the final values,
+    # but allows us to write a generalized kernel supporting partial reductions.
+    R′ = reshape(R, (size(R)..., 1))
+
+    # determine how many threads we can launch
+    args = (f, op, A, R′, init, Rreduce, Rother, 1, Val(shuffle))
+    kernel_args = cudaconvert.(args)
+    kernel_tt = Tuple{Core.Typeof.(kernel_args)...}
+    kernel = cufunction(partial_mapreduce_grid, kernel_tt)
+    kernel_config =
+        launch_configuration(kernel.fun; shmem = shuffle ? 0 : threads->2*threads*sizeof(T))
+
+    # determine the launch configuration
+    dev = device()
+    reduce_threads = shuffle ? nextwarp(dev, length(Rreduce)) : nextpow(2, length(Rreduce))
+    if reduce_threads > kernel_config.threads
+        reduce_threads = shuffle ? prevwarp(dev, kernel_config.threads) : prevpow(2, kernel_config.threads)
+    end
+    reduce_blocks = min(reduce_threads, cld(length(Rreduce), reduce_threads))
+    other_blocks = length(Rother)
+    threads, blocks = reduce_threads, reduce_blocks*other_blocks
+    shmem = shuffle ? 0 : 2*threads*sizeof(T)
+
+    # perform the actual reduction
+    if reduce_blocks == 1
+        # we can cover the dimensions to reduce using a single block
+        @cuda threads=threads blocks=blocks shmem=shmem partial_mapreduce_grid(
+            f, op, A, R′, init, Rreduce, Rother, 1, Val(shuffle))
+    else
+        # we need multiple steps to cover all values to reduce
+        partial = similar(R, (size(R)..., reduce_blocks))
+        if init === nothing
+            # without an explicit initializer we need to copy from the output container
+            sz = prod(size(R))
+            for i in 1:reduce_blocks
+                # TODO: async copies (or async fill!, but then we'd need to load first)
+                # or maybe just broadcast since that extends singleton dimensions
+                copyto!(partial, (i-1)*sz+1, R, 1, sz)
+            end
         end
-        blocks = length(Rother)
-        shmem = shuffle ? 0 : 2*threads*sizeof(T)
+        @cuda threads=threads blocks=blocks shmem=shmem partial_mapreduce_grid(
+            f, op, A, partial, init, Rreduce, Rother, reduce_blocks, Val(shuffle))
 
-        return (threads=threads, blocks=blocks, shmem=shmem)
+        GPUArrays.mapreducedim!(identity, op, R′, partial, init)
     end
 
-    @cuda config=configurator mapreduce_grid(f, op, A, R, init, Rreduce, Rother, Val(shuffle))
-
     return R
 end
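The shape bookkeeping of the two-step path can be emulated on the CPU. The sketch below is illustrative only, with hypothetical sizes and Base functions standing in for the kernel launch and for the final `GPUArrays.mapreducedim!` call:

A = rand(3, 4)                  # reduce a 3×4 matrix along dimension 2
R = zeros(3, 1)                 # user-visible output
reduce_blocks = 2               # pretend two blocks partially reduce per output

R′ = reshape(R, (size(R)..., 1))                    # size (3, 1, 1)
partial = similar(R, (size(R)..., reduce_blocks))   # size (3, 1, 2), one slice per block

# stand-in for the first kernel launch: each "block" reduces part of the data
# (the GPU partition is strided, but for + any partition yields the same result)
for b in 1:reduce_blocks
    partial[:, :, b] = sum(A[:, b:reduce_blocks:end], dims=2)
end

# stand-in for GPUArrays.mapreducedim!(identity, op, R′, partial, init):
# collapse the extra dimension back into the reshaped output
sum!(R′, partial)
@assert R ≈ sum(A, dims=2)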