@@ -87,12 +87,13 @@ Base.@propagate_inbounds _map_getindex(args::Tuple{}, I) = ()
87
87
# Reduce an array across the grid. All elements to be processed can be addressed by the
88
88
# product of the two iterators `Rreduce` and `Rother`, where the latter iterator will have
89
89
# singleton entries for the dimensions that should be reduced (and vice versa).
90
- function partial_mapreduce_grid (f, op, neutral, Rreduce, Rother, gridDim_reduce, shuffle, R, As... )
90
+ function partial_mapreduce_grid (f, op, neutral, Rreduce, Rother, shuffle, R, As... )
91
91
# decompose the 1D hardware indices into separate ones for reduction (across threads
92
92
# and possibly blocks if one block is too small) and other elements (remaining blocks)
93
93
threadIdx_reduce = threadIdx (). x
94
94
blockDim_reduce = blockDim (). x
95
- blockIdx_other, blockIdx_reduce = fldmod1 (blockIdx (). x, gridDim_reduce)
95
+ blockIdx_reduce, blockIdx_other = fldmod1 (blockIdx (). x, length (Rother))
96
+ gridDim_reduce = gridDim (). x ÷ length (Rother)
96
97
97
98
# block-based indexing into the values outside of the reduction dimension
98
99
# (that means we can safely synchronize threads within this block)
@@ -163,7 +164,7 @@ NVTX.@range function GPUArrays.mapreducedim!(f, op, R::CuArray{T}, As::AbstractA
163
164
R′ = reshape (R, (size (R)... , 1 ))
164
165
165
166
# determine how many threads we can launch
166
- args = (f, op, init, Rreduce, Rother, 1 , Val (shuffle), R′, As... )
167
+ args = (f, op, init, Rreduce, Rother, Val (shuffle), R′, As... )
167
168
kernel_args = cudaconvert .(args)
168
169
kernel_tt = Tuple{Core. Typeof .(kernel_args)... }
169
170
kernel = cufunction (partial_mapreduce_grid, kernel_tt)
@@ -185,7 +186,7 @@ NVTX.@range function GPUArrays.mapreducedim!(f, op, R::CuArray{T}, As::AbstractA
185
186
if reduce_blocks == 1
186
187
# we can cover the dimensions to reduce using a single block
187
188
@cuda threads= threads blocks= blocks shmem= shmem partial_mapreduce_grid (
188
- f, op, init, Rreduce, Rother, 1 , Val (shuffle), R′, As... )
189
+ f, op, init, Rreduce, Rother, Val (shuffle), R′, As... )
189
190
else
190
191
# we need multiple steps to cover all values to reduce
191
192
partial = similar (R, (size (R)... , reduce_blocks))
@@ -199,7 +200,7 @@ NVTX.@range function GPUArrays.mapreducedim!(f, op, R::CuArray{T}, As::AbstractA
199
200
end
200
201
end
201
202
@cuda threads= threads blocks= blocks shmem= shmem partial_mapreduce_grid (
202
- f, op, init, Rreduce, Rother, reduce_blocks, Val (shuffle), partial, As... )
203
+ f, op, init, Rreduce, Rother, Val (shuffle), partial, As... )
203
204
204
205
GPUArrays. mapreducedim! (identity, op, R′, partial; init= init)
205
206
end
0 commit comments