Skip to content
This repository was archived by the owner on Mar 12, 2021. It is now read-only.

Commit 47a90a7

Browse files
committed
Figure out the block sizes without passing additional arguments.
1 parent 1b28b30 commit 47a90a7

File tree

1 file changed

+6
-5
lines changed

1 file changed

+6
-5
lines changed

src/mapreduce.jl

Lines changed: 6 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -87,12 +87,13 @@ Base.@propagate_inbounds _map_getindex(args::Tuple{}, I) = ()
8787
# Reduce an array across the grid. All elements to be processed can be addressed by the
8888
# product of the two iterators `Rreduce` and `Rother`, where the latter iterator will have
8989
# singleton entries for the dimensions that should be reduced (and vice versa).
90-
function partial_mapreduce_grid(f, op, neutral, Rreduce, Rother, gridDim_reduce, shuffle, R, As...)
90+
function partial_mapreduce_grid(f, op, neutral, Rreduce, Rother, shuffle, R, As...)
9191
# decompose the 1D hardware indices into separate ones for reduction (across threads
9292
# and possibly blocks if it doesn't fit) and other elements (remaining blocks)
9393
threadIdx_reduce = threadIdx().x
9494
blockDim_reduce = blockDim().x
95-
blockIdx_other, blockIdx_reduce = fldmod1(blockIdx().x, gridDim_reduce)
95+
blockIdx_reduce, blockIdx_other = fldmod1(blockIdx().x, length(Rother))
96+
gridDim_reduce = gridDim().x ÷ length(Rother)
9697

9798
# block-based indexing into the values outside of the reduction dimension
9899
# (that means we can safely synchronize threads within this block)
@@ -163,7 +164,7 @@ NVTX.@range function GPUArrays.mapreducedim!(f, op, R::CuArray{T}, As::AbstractA
163164
R′ = reshape(R, (size(R)..., 1))
164165

165166
# determine how many threads we can launch
166-
args = (f, op, init, Rreduce, Rother, 1, Val(shuffle), R′, As...)
167+
args = (f, op, init, Rreduce, Rother, Val(shuffle), R′, As...)
167168
kernel_args = cudaconvert.(args)
168169
kernel_tt = Tuple{Core.Typeof.(kernel_args)...}
169170
kernel = cufunction(partial_mapreduce_grid, kernel_tt)
@@ -185,7 +186,7 @@ NVTX.@range function GPUArrays.mapreducedim!(f, op, R::CuArray{T}, As::AbstractA
185186
if reduce_blocks == 1
186187
# we can cover the dimensions to reduce using a single block
187188
@cuda threads=threads blocks=blocks shmem=shmem partial_mapreduce_grid(
188-
f, op, init, Rreduce, Rother, 1, Val(shuffle), R′, As...)
189+
f, op, init, Rreduce, Rother, Val(shuffle), R′, As...)
189190
else
190191
# we need multiple steps to cover all values to reduce
191192
partial = similar(R, (size(R)..., reduce_blocks))
@@ -199,7 +200,7 @@ NVTX.@range function GPUArrays.mapreducedim!(f, op, R::CuArray{T}, As::AbstractA
199200
end
200201
end
201202
@cuda threads=threads blocks=blocks shmem=shmem partial_mapreduce_grid(
202-
f, op, init, Rreduce, Rother, reduce_blocks, Val(shuffle), partial, As...)
203+
f, op, init, Rreduce, Rother, Val(shuffle), partial, As...)
203204

204205
GPUArrays.mapreducedim!(identity, op, R′, partial; init=init)
205206
end

0 commit comments

Comments
 (0)