review

wongalvis14 · web-flow · commit fec501afc5d0 · 2020-03-25T00:15:19.000-07:00
diff --git a/src/mapreduce.jl b/src/mapreduce.jl
@@ -110,7 +110,7 @@ function partial_mapreduce_grid(f, op, A, R, neutral, Rreduce, Rother, gridDim_r
         
         val = op(neutral, neutral)
 
-        # get a value that should be reduced
+        # reduce serially across chunks of the input vector that don't fit in a block
         ireduce = threadIdx_reduce + (blockIdx_reduce - 1) * blockDim_reduce
         while ireduce <= length(Rreduce)
             Ireduce = Rreduce[ireduce]
@@ -142,8 +142,7 @@ NVTX.@range function GPUArrays.mapreducedim!(f, op, R::CuArray{T}, A::AbstractAr
     # be conservative about using shuffle instructions
     shuffle = true
     shuffle &= capability(device()) >= v"3.0"
-    shuffle &= T in (Int32, Int64, Float32, Float64, ComplexF32, ComplexF64)
-    # TODO: add support for Bool (CUDAnative.jl#420)
+    shuffle &= T in (Bool, Int32, Int64, Float32, Float64, ComplexF32, ComplexF64)
 
     # iteration domain, split in two: one part covers the dimensions that should
     # be reduced, and the other covers the rest. combining both covers all values.