@@ -133,13 +133,17 @@ end
 
 ## COV_EXCL_STOP
 
-NVTX.@range function GPUArrays.mapreducedim!(f, op, R::CuArray{T}, As::AbstractArray...; init=nothing) where T
-    # TODO: Broadcast-semantics after JuliaLang/julia#31020
-    A = first(As)
-    all(B -> size(A) == size(B), As) || throw(DimensionMismatch("dimensions of containers must be identical"))
+if VERSION < v"1.5.0-DEV.748"
+    Base.axes(bc::Base.Broadcast.Broadcasted{<:CuArrayStyle, <:NTuple{N}},
+              d::Integer) where N =
+        d <= N ? axes(bc)[d] : Base.OneTo(1)
+end
 
+NVTX.@range function GPUArrays.mapreducedim!(f, op, R::CuArray{T},
+                                             A::Union{AbstractArray,Broadcast.Broadcasted};
+                                             init=nothing) where T
     Base.check_reducedims(R, A)
-    isempty(A) && return R
+    length(A) == 0 && return R  # isempty(::Broadcasted) iterates
 
     f = cufunc(f)
     op = cufunc(op)
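
The two additions above are connected: once mapreducedim! accepts a lazy Broadcast.Broadcasted argument, downstream code such as Base.check_reducedims presumably queries axes(A, d) for trailing dimensions, which Broadcasted only answers like an array from Julia 1.5.0-DEV.748 onwards; the version-gated shim backfills that behavior. A minimal CPU-side sketch of the semantics involved (illustrative only, not part of the diff):

using Base.Broadcast: broadcasted

A = rand(4, 3)
bc = broadcasted(+, A, 1)  # lazy broadcast expression, never materialized
axes(bc)                   # (Base.OneTo(4), Base.OneTo(3))
axes(A, 3)                 # Base.OneTo(1): arrays treat trailing dims as singletons
# the shim gives axes(bc, 3) that same singleton behavior on older Julia;
# relatedly, length(bc) == 0 is computed from these axes, while the generic
# isempty fallback would iterate the Broadcasted element by element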
@@ -156,8 +160,8 @@ NVTX.@range function GPUArrays.mapreducedim!(f, op, R::CuArray{T}, As::AbstractA
 
     # iteration domain, split in two: one part covers the dimensions that should
     # be reduced, and the other covers the rest. combining both covers all values.
-    Rall = CartesianIndices(A)
-    Rother = CartesianIndices(R)
+    Rall = CartesianIndices(axes(A))
+    Rother = CartesianIndices(axes(R))
     Rreduce = CartesianIndices(ifelse.(axes(A) .== axes(R), Ref(Base.OneTo(1)), axes(A)))
     # NOTE: we hard-code `OneTo` (`first.(axes(A))` would work too) or we get a
     # CartesianIndices object with UnitRanges that behave badly on the GPU.
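
As a sanity check on the comment above, a hypothetical CPU-only illustration of the split (variable names mirror the diff, with a 4×3 input reduced along dimension 2):

A = rand(4, 3)
R = zeros(4, 1)                       # reducing dimension 2
Rall    = CartesianIndices(axes(A))   # 4×3: every input element
Rother  = CartesianIndices(axes(R))   # 4×1: one index per output slot
Rreduce = CartesianIndices(ifelse.(axes(A) .== axes(R), Ref(Base.OneTo(1)), axes(A)))
                                      # 1×3: the elements collapsing into one slot
# combining one index from Rother with one from Rreduce (the kernel merges them
# elementwise) reaches every element of Rall exactly once:
@assert length(Rother) * length(Rreduce) == length(Rall)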
@@ -187,7 +191,7 @@ NVTX.@range function GPUArrays.mapreducedim!(f, op, R::CuArray{T}, As::AbstractA
     # we might not be able to launch all those threads to reduce each slice in one go.
     # that's why each thread also loops across its inputs, processing multiple values
     # so that we can span the entire reduction dimension using a single thread block.
-    args = (f, op, init, Rreduce, Rother, Val(shuffle), R′, As...)
+    args = (f, op, init, Rreduce, Rother, Val(shuffle), R′, A)
     kernel_args = cudaconvert.(args)
     kernel_tt = Tuple{Core.Typeof.(kernel_args)...}
     kernel = cufunction(partial_mapreduce_grid, kernel_tt)
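
The loop the comment describes can be pictured serially: each of a block's threads strides through the reduction slice, so a single block spans a dimension of any length. A hypothetical CPU analogue (the function name and neutral argument are made up for illustration):

function thread_accumulate(f, op, slice, tid, threads, neutral)
    acc = neutral
    i = tid
    while i <= length(slice)      # block-stride loop over the reduction slice
        acc = op(acc, f(slice[i]))
        i += threads
    end
    return acc   # per-thread partials are later combined across the block
end

# four "threads" each cover every 4th element, together covering the slice:
partials = [thread_accumulate(identity, +, collect(1:10), tid, 4, 0) for tid in 1:4]
@assert sum(partials) == sum(1:10)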
@@ -218,7 +222,7 @@ NVTX.@range function GPUArrays.mapreducedim!(f, op, R::CuArray{T}, As::AbstractA
     if reduce_blocks == 1
         # we can cover the dimensions to reduce using a single block
         @cuda threads=threads blocks=blocks shmem=shmem partial_mapreduce_grid(
-            f, op, init, Rreduce, Rother, Val(shuffle), R′, As...)
+            f, op, init, Rreduce, Rother, Val(shuffle), R′, A)
     else
         # we need multiple steps to cover all values to reduce
         partial = similar(R, (size(R)..., reduce_blocks))
@@ -232,7 +236,7 @@ NVTX.@range function GPUArrays.mapreducedim!(f, op, R::CuArray{T}, As::AbstractA
             end
         end
         @cuda threads=threads blocks=blocks shmem=shmem partial_mapreduce_grid(
-            f, op, init, Rreduce, Rother, Val(shuffle), partial, As...)
+            f, op, init, Rreduce, Rother, Val(shuffle), partial, A)
 
         GPUArrays.mapreducedim!(identity, op, R′, partial; init=init)
     end
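
When reduce_blocks > 1, each block deposits its partial result into an extra trailing dimension of `partial`, and the recursive mapreducedim! call folds those partials into R′. A hypothetical CPU analogue of that two-pass scheme, with `sum` standing in for the kernel:

A = rand(4, 1024); R = zeros(4, 1)
reduce_blocks = 8
partial = similar(R, (size(R)..., reduce_blocks))   # 4×1×8, as in the diff
chunk = cld(size(A, 2), reduce_blocks)
for b in 1:reduce_blocks                            # each "block" reduces one chunk
    cols = (b-1)*chunk+1 : min(b*chunk, size(A, 2))
    partial[:, :, b] .= sum(A[:, cols]; dims=2)
end
R .= dropdims(sum(partial; dims=3); dims=3)         # second pass over the partials
@assert R ≈ sum(A; dims=2)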