     return val
 end
 
-# Partially reduce an array across the grid. The reduction is partial, with multiple
-# blocks `gridDim_reduce` working on reducing data from `A` and writing it to multiple
-# outputs in `R`. All elements to be processed can be addressed by the product of the
-# two iterators `Rreduce` and `Rother`, where the latter iterator will have singleton
-# entries for the dimensions that should be reduced (and vice versa). The output array
-# is expected to have an additional dimension with as size the number of reduced values
-# for every reduction (i.e. more than one if there's multiple blocks participating).
-function partial_mapreduce_grid(f, op, A, R, neutral, Rreduce, Rother, gridDim_reduce, shuffle)
-    # decompose the 1D hardware indices into separate ones for reduction (across threads
-    # and possibly blocks if it doesn't fit) and other elements (remaining blocks)
-    threadIdx_reduce = threadIdx().x
-    blockDim_reduce = blockDim().x
-    blockIdx_other, blockIdx_reduce = fldmod1(blockIdx().x, gridDim_reduce)
-
+# Reduce an array across the grid. All elements to be processed can be addressed by the
+# product of the two iterators `Rreduce` and `Rother`, where the latter iterator will have
+# singleton entries for the dimensions that should be reduced (and vice versa).
+function mapreduce_grid(f, op, A, R, neutral, Rreduce, Rother, shuffle)
     # block-based indexing into the values outside of the reduction dimension
     # (that means we can safely synchronize threads within this block)
-    iother = blockIdx_other
+    iother = blockIdx().x
     @inbounds if iother <= length(Rother)
         Iother = Rother[iother]
 
         # load the neutral value
-        Iout = CartesianIndex(Tuple(Iother)..., blockIdx_reduce)
+        Iout = Iother
         neutral = if neutral === nothing
             R[Iout]
         else
             neutral
         end
 
-        # get a value that should be reduced
-        ireduce = threadIdx_reduce + (blockIdx_reduce - 1) * blockDim_reduce
-        val = if ireduce <= length(Rreduce)
+        val = op(neutral, neutral)
+
+        # reduce serially across chunks of input vector that don't fit in a block
+        ireduce = threadIdx().x
+        while ireduce <= length(Rreduce)
             Ireduce = Rreduce[ireduce]
             J = max(Iother, Ireduce)
-            f(A[J])
-        else
-            neutral
+            val = op(val, f(A[J]))
+            ireduce += blockDim().x
         end
-        val = op(val, neutral)
 
+        # reduce in parallel within the current block
         val = reduce_block(op, val, neutral, shuffle)
 
         # write back to memory
-        if threadIdx_reduce == 1
+        if threadIdx().x == 1
             R[Iout] = val
         end
     end
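The new kernel drops the decomposition of the reduction dimension across multiple blocks: each block now handles exactly one entry of `Rother`, and its threads walk `Rreduce` in strides of `blockDim().x` before `reduce_block` combines the per-thread partials. A minimal CPU-only sketch of that indexing scheme (plain Julia, no GPU required; `nthreads` and `tid` are hypothetical stand-ins for `blockDim().x` and `threadIdx().x`):

```julia
# CPU emulation of the block-stride indexing used by `mapreduce_grid` above.
# `Rreduce`/`Rother` are CartesianIndices as in the real code; `nthreads` stands
# in for blockDim().x and `tid` for threadIdx().x (both hypothetical here).
function emulate_block(f, op, A, neutral, Rreduce, Rother, iother, nthreads)
    Iother = Rother[iother]
    partials = fill(neutral, nthreads)          # one running value per "thread"
    for tid in 1:nthreads
        ireduce = tid
        while ireduce <= length(Rreduce)        # serial, block-stride loop
            J = max(Iother, Rreduce[ireduce])   # combine the two iterators
            partials[tid] = op(partials[tid], f(A[J]))
            ireduce += nthreads
        end
    end
    return reduce(op, partials)                 # plays the role of reduce_block
end

A = rand(4, 3)
Rreduce = CartesianIndices((4, 1))   # dimensions being reduced
Rother  = CartesianIndices((1, 3))   # dimensions being kept
# one "block" per entry of Rother; should match vec(sum(A, dims=1))
[emulate_block(identity, +, A, 0.0, Rreduce, Rother, i, 8) for i in 1:length(Rother)]
```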
@@ -142,8 +133,7 @@ NVTX.@range function GPUArrays.mapreducedim!(f, op, R::CuArray{T}, A::AbstractAr
     # be conservative about using shuffle instructions
     shuffle = true
     shuffle &= capability(device()) >= v"3.0"
-    shuffle &= T in (Int32, Int64, Float32, Float64, ComplexF32, ComplexF64)
-    # TODO: add support for Bool (CUDAnative.jl#420)
+    shuffle &= T in (Bool, Int32, Int64, Float32, Float64, ComplexF32, ComplexF64)
 
     # iteration domain, split in two: one part covers the dimensions that should
     # be reduced, and the other covers the rest. combining both covers all values.
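The only functional change in this hunk is that `Bool` joins the element types for which the warp-shuffle path is allowed, and the TODO referencing CUDAnative.jl#420 goes away accordingly. As a standalone sketch of that gate, assuming CUDAnative's `capability`/`device` are in scope as they are in this file:

```julia
# Sketch of the shuffle eligibility check after this change; `can_shuffle` is a
# hypothetical helper, the real code keeps this logic inline in mapreducedim!.
can_shuffle(::Type{T}) where {T} =
    capability(device()) >= v"3.0" &&
    T in (Bool, Int32, Int64, Float32, Float64, ComplexF32, ComplexF64)
```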
@@ -154,52 +144,21 @@ NVTX.@range function GPUArrays.mapreducedim!(f, op, R::CuArray{T}, A::AbstractAr
     # CartesianIndices object with UnitRanges that behave badly on the GPU.
     @assert length(Rall) == length(Rother) * length(Rreduce)
 
-    # allocate an additional, empty dimension to write the reduced value to.
-    # this does not affect the actual location in memory of the final values,
-    # but allows us to write a generalized kernel supporting partial reductions.
-    R′ = reshape(R, (size(R)..., 1))
-
-    # determine how many threads we can launch
-    args = (f, op, A, R′, init, Rreduce, Rother, 1, Val(shuffle))
-    kernel_args = cudaconvert.(args)
-    kernel_tt = Tuple{Core.Typeof.(kernel_args)...}
-    kernel = cufunction(partial_mapreduce_grid, kernel_tt)
-    kernel_config =
-        launch_configuration(kernel.fun; shmem=shuffle ? 0 : threads->2*threads*sizeof(T))
-
-    # determine the launch configuration
-    dev = device()
-    reduce_threads = shuffle ? nextwarp(dev, length(Rreduce)) : nextpow(2, length(Rreduce))
-    if reduce_threads > kernel_config.threads
-        reduce_threads = shuffle ? prevwarp(dev, kernel_config.threads) : prevpow(2, kernel_config.threads)
-    end
-    reduce_blocks = cld(length(Rreduce), reduce_threads)
-    other_blocks = length(Rother)
-    threads, blocks = reduce_threads, reduce_blocks*other_blocks
-    shmem = shuffle ? 0 : 2*threads*sizeof(T)
-
-    # perform the actual reduction
-    if reduce_blocks == 1
-        # we can cover the dimensions to reduce using a single block
-        @cuda threads=threads blocks=blocks shmem=shmem partial_mapreduce_grid(
-            f, op, A, R′, init, Rreduce, Rother, 1, Val(shuffle))
-    else
-        # we need multiple steps to cover all values to reduce
-        partial = similar(R, (size(R)..., reduce_blocks))
-        if init === nothing
-            # without an explicit initializer we need to copy from the output container
-            sz = prod(size(R))
-            for i in 1:reduce_blocks
-                # TODO: async copies (or async fill!, but then we'd need to load first)
-                # or maybe just broadcast since that extends singleton dimensions
-                copyto!(partial, (i-1)*sz+1, R, 1, sz)
-            end
+    function configurator(kernel)
+        config = launch_configuration(kernel.fun)
+        dev = device()
+
+        threads = shuffle ? nextwarp(dev, length(Rreduce)) : nextpow(2, length(Rreduce))
+        if threads > config.threads
+            threads = shuffle ? prevwarp(dev, config.threads) : prevpow(2, config.threads)
         end
-        @cuda threads=threads blocks=blocks shmem=shmem partial_mapreduce_grid(
-            f, op, A, partial, init, Rreduce, Rother, reduce_blocks, Val(shuffle))
+        blocks = length(Rother)
+        shmem = shuffle ? 0 : 2*threads*sizeof(T)
 
-        GPUArrays.mapreducedim!(identity, op, R′, partial, init)
+        return (threads=threads, blocks=blocks, shmem=shmem)
     end
 
+    @cuda config=configurator mapreduce_grid(f, op, A, R, init, Rreduce, Rother, Val(shuffle))
+
     return R
 end
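With the partial-reduction fallback gone, the launch reduces to a single `@cuda config=configurator ...` call: the callback receives the compiled kernel, asks `launch_configuration` for an occupancy-based thread limit, and returns the `(threads, blocks, shmem)` tuple used for the launch. A self-contained sketch of that `config=` pattern with a toy kernel (assuming CUDAnative.jl/CuArrays.jl of this era; `fill_kernel` and the array sizes are made up for illustration):

```julia
using CUDAnative, CuArrays

# Toy kernel: each thread writes its global index into `out`.
function fill_kernel(out, n)
    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    if i <= n
        @inbounds out[i] = Int32(i)
    end
    return
end

out = CuArrays.zeros(Int32, 1000)

# Launch-configuration callback, mirroring the `configurator` in the diff:
# query occupancy limits for the compiled kernel, then pick threads/blocks.
function configurator(kernel)
    config  = launch_configuration(kernel.fun)
    threads = min(length(out), config.threads)
    blocks  = cld(length(out), threads)
    return (threads=threads, blocks=blocks)
end

@cuda config=configurator fill_kernel(out, length(out))
```

Compared with the old path, which compiled the kernel by hand via `cufunction` before querying `launch_configuration`, the callback form lets `@cuda` handle compilation and still feed occupancy information into the launch parameters.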