This repository was archived by the owner on Mar 12, 2021. It is now read-only.

Commit 6266f76 (parent: f44cf19)

Support and use broadcast with mapreduce.

File tree: 2 files changed (+19, -12 lines): Manifest.toml, src/mapreduce.jl

Manifest.toml
Lines changed: 5 additions & 2 deletions

@@ -77,7 +77,9 @@ version = "0.1.1"
 
 [[GPUArrays]]
 deps = ["AbstractFFTs", "Adapt", "LinearAlgebra", "Printf", "Random", "Serialization"]
-git-tree-sha1 = "c63cb01e3b6f48ab39f1e35c31ba870650814a18"
+git-tree-sha1 = "5f2f5401051d7cfe64adf2cfaf68fbf3c1fc7d55"
+repo-rev = "tb/mapreduce_broadcast"
+repo-url = "https://github.com/JuliaGPU/GPUArrays.jl.git"
 uuid = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
 version = "3.2.0"
 

@@ -92,6 +94,7 @@ uuid = "929cbde3-209d-540e-8aea-75f648917ca0"
 version = "1.3.4"
 
 [[LibGit2]]
+deps = ["Printf"]
 uuid = "76f85450-5226-5b5a-8eaa-529ad045b433"
 
 [[Libdl]]

@@ -127,7 +130,7 @@ uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d"
 version = "1.1.0"
 
 [[Pkg]]
-deps = ["Dates", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Test", "UUIDs"]
+deps = ["Dates", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "UUIDs"]
 uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
 
 [[Printf]]
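The repo-rev and repo-url entries pin GPUArrays to an unreleased branch rather than a registered release. For reference, a sketch of how such a pin is produced with standard Pkg tooling (ordinary Pkg usage, not part of this commit):

    pkg> add GPUArrays#tb/mapreduce_broadcast

    # or, from the Julia API:
    using Pkg
    Pkg.add(PackageSpec(name="GPUArrays", rev="tb/mapreduce_broadcast"))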

src/mapreduce.jl
Lines changed: 14 additions & 10 deletions

@@ -133,13 +133,17 @@ end
 
 ## COV_EXCL_STOP
 
-NVTX.@range function GPUArrays.mapreducedim!(f, op, R::CuArray{T}, As::AbstractArray...; init=nothing) where T
-    # TODO: Broadcast-semantics after JuliaLang-julia#31020
-    A = first(As)
-    all(B -> size(A) == size(B), As) || throw(DimensionMismatch("dimensions of containers must be identical"))
+if VERSION < v"1.5.0-DEV.748"
+    Base.axes(bc::Base.Broadcast.Broadcasted{<:CuArrayStyle, <:NTuple{N}},
+              d::Integer) where N =
+        d <= N ? axes(bc)[d] : Base.OneTo(1)
+end
 
+NVTX.@range function GPUArrays.mapreducedim!(f, op, R::CuArray{T},
+                                             A::Union{AbstractArray,Broadcast.Broadcasted};
+                                             init=nothing) where T
     Base.check_reducedims(R, A)
-    isempty(A) && return R
+    length(A) == 0 && return R # isempty(::Broadcasted) iterates
 
     f = cufunc(f)
     op = cufunc(op)
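The new signature accepts a lazy Broadcast.Broadcasted object alongside plain arrays, so a mapped expression can be fused into the reduction without materializing an intermediate array. A minimal usage sketch, assuming this repository's CuArrays package (array names and values are illustrative, not from the commit):

    using CuArrays, GPUArrays
    a = cu(rand(Float32, 4, 4))
    # lazy broadcast: broadcasted() builds the expression tree,
    # instantiate() attaches axes; neither allocates an output
    bc = Broadcast.instantiate(Broadcast.broadcasted(+, a, 1f0))
    R = CuArrays.zeros(Float32, 1, 4)
    # column sums of `a .+ 1`, computed in one fused kernel
    GPUArrays.mapreducedim!(identity, +, R, bc)

The `length(A) == 0` check replaces `isempty(A)` because, as the inline comment notes, `isempty` on a Broadcasted falls back to iterating the object.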
@@ -156,8 +160,8 @@ NVTX.@range function GPUArrays.mapreducedim!(f, op, R::CuArray{T}, As::AbstractA
 
     # iteration domain, split in two: one part covers the dimensions that should
     # be reduced, and the other covers the rest. combining both covers all values.
-    Rall = CartesianIndices(A)
-    Rother = CartesianIndices(R)
+    Rall = CartesianIndices(axes(A))
+    Rother = CartesianIndices(axes(R))
     Rreduce = CartesianIndices(ifelse.(axes(A) .== axes(R), Ref(Base.OneTo(1)), axes(A)))
     # NOTE: we hard-code `OneTo` (`first.(axes(A))` would work too) or we get a
     # CartesianIndices object with UnitRanges that behave badly on the GPU.
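To make the domain split concrete, here is a CPU-side sketch of the three index sets when reducing a 4×3 array over its first dimension (hypothetical sizes, not from the commit):

    A = rand(4, 3); R = zeros(1, 3)        # reduce over dimension 1
    Rall    = CartesianIndices(axes(A))    # 4×3: every element of A
    Rother  = CartesianIndices(axes(R))    # 1×3: the dimensions that are kept
    Rreduce = CartesianIndices(ifelse.(axes(A) .== axes(R),
                                       Ref(Base.OneTo(1)), axes(A)))
    # Rreduce is 4×1: the dimension being reduced. For each index in Rother,
    # walking Rreduce visits exactly the elements that fold into that output
    # slot, so Rother and Rreduce together cover all of Rall.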
@@ -187,7 +191,7 @@ NVTX.@range function GPUArrays.mapreducedim!(f, op, R::CuArray{T}, As::AbstractA
     # we might not be able to launch all those threads to reduce each slice in one go.
     # that's why each thread also loops across its inputs, processing multiple values
     # so that we can span the entire reduction dimension using a single thread block.
-    args = (f, op, init, Rreduce, Rother, Val(shuffle), R′, As...)
+    args = (f, op, init, Rreduce, Rother, Val(shuffle), R′, A)
     kernel_args = cudaconvert.(args)
     kernel_tt = Tuple{Core.Typeof.(kernel_args)...}
     kernel = cufunction(partial_mapreduce_grid, kernel_tt)
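Compiling with cufunction and holding on to the kernel object, rather than launching through @cuda directly, is the expanded form of the macro: it lets the surrounding code inspect the compiled kernel (e.g., to pick an occupancy-based launch configuration) before committing to a thread/block shape. A sketch of the eventual launch, assuming CUDAnative's callable-kernel convention:

    # launch the precompiled kernel with an explicit configuration
    kernel(kernel_args...; threads=threads, blocks=blocks, shmem=shmem)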
@@ -218,7 +222,7 @@ NVTX.@range function GPUArrays.mapreducedim!(f, op, R::CuArray{T}, As::AbstractA
     if reduce_blocks == 1
         # we can cover the dimensions to reduce using a single block
         @cuda threads=threads blocks=blocks shmem=shmem partial_mapreduce_grid(
-            f, op, init, Rreduce, Rother, Val(shuffle), R′, As...)
+            f, op, init, Rreduce, Rother, Val(shuffle), R′, A)
     else
         # we need multiple steps to cover all values to reduce
         partial = similar(R, (size(R)..., reduce_blocks))
@@ -232,7 +236,7 @@ NVTX.@range function GPUArrays.mapreducedim!(f, op, R::CuArray{T}, As::AbstractA
             end
         end
         @cuda threads=threads blocks=blocks shmem=shmem partial_mapreduce_grid(
-            f, op, init, Rreduce, Rother, Val(shuffle), partial, As...)
+            f, op, init, Rreduce, Rother, Val(shuffle), partial, A)
 
         GPUArrays.mapreducedim!(identity, op, R′, partial; init=init)
     end
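When a single block per output slice cannot span the whole reduction dimension, the code reduces in two passes: the grid-wide kernel writes one partial value per block into `partial`, and the recursive mapreducedim! call folds those partials into the final result. A CPU analogue of the scheme (illustrative names and sizes, not from the commit):

    xs = rand(Float32, 10_000)
    block_len = 256
    # pass 1: each "block" reduces its chunk to a single partial value
    partials = [reduce(+, chunk) for chunk in Iterators.partition(xs, block_len)]
    # pass 2: fold the per-block partials into the final result
    total = reduce(+, partials)
    total ≈ sum(xs)  # true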
