
Commit 836de1d

Merge pull request #515 from JuliaGPU/tb/cudnn_broadcast
Fix CUDNN-optimized activation broadcasts
2 parents 9465f3b + beffb7d commit 836de1d

4 files changed (+58, -23 lines)
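The bug being fixed: the old broadcast overloads in lib/cudnn/nnlib.jl applied the CUDNN activation in place on the input and returned it, so even an out-of-place broadcast like `y = tanh.(x)` silently overwrote `x`. A minimal sketch of the symptom (assuming a CUDA-capable setup with the CUDA.jl package loaded; this mirrors the new test added below):

    using CUDA

    x = CUDA.ones(1)   # [1f0] on the GPU
    y = tanh.(x)       # should allocate a new array and leave x alone
    Array(x)           # with this fix: [1f0]; before it, x had been overwritten with tanh(1f0)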

lib/cudnn/activation.jl

Lines changed: 6 additions & 3 deletions
@@ -22,16 +22,19 @@ end
 
 function cudnnActivationForward(x::DenseCuArray{T,N}, y::DenseCuArray{T,N}=x;
                                 mode=CUDNN_ACTIVATION_RELU, # CUDNN_ACTIVATION_IDENTITY will not work
-                                coeff=0.0, reluNanOpt=CUDNN_NOT_PROPAGATE_NAN, alpha=1, beta=0) where {T,N}
+                                coeff=false, reluNanOpt=CUDNN_NOT_PROPAGATE_NAN, alpha=true,
+                                beta=false) where {T,N}
     cudnnActivationForward(handle(), ActivationDesc(mode, T(coeff), reluNanOpt),
                            scalingParameter(T, alpha), TensorDesc(x), x,
                            scalingParameter(T, beta ), TensorDesc(y), y)
     return y
 end
 
-function cudnnActivationBackward(x::DenseCuArray{T,N}, dx::DenseCuArray{T,N}, y::DenseCuArray{T,N}, dy::DenseCuArray{T,N}=dx;
+function cudnnActivationBackward(x::DenseCuArray{T,N}, dx::DenseCuArray{T,N},
+                                 y::DenseCuArray{T,N}, dy::DenseCuArray{T,N}=dx;
                                  mode=CUDNN_ACTIVATION_RELU, # CUDNN_ACTIVATION_IDENTITY will not work
-                                 coeff=0.0, reluNanOpt=CUDNN_NOT_PROPAGATE_NAN, alpha=1, beta=0) where {T,N}
+                                 coeff=false, reluNanOpt=CUDNN_NOT_PROPAGATE_NAN, alpha=1,
+                                 beta=false) where {T,N}
     cudnnActivationBackward(handle(), ActivationDesc(mode, T(coeff), reluNanOpt),
                             scalingParameter(T, alpha), TensorDesc( y), y,
                             TensorDesc(dy), dy,
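Besides the reformatted signatures, most defaults move from numeric literals to Bool one/zero (see the note after the tensor.jl hunk below). The destination `y` still defaults to `x`, which is what the old broadcast code leaned on; the rewritten nnlib.jl code below always passes an explicit destination. An out-of-place call might look like this (a sketch only, assuming the unexported `CUDA.CUDNN` internals and a 4-D array, since cuDNN tensor descriptors generally expect at least four dimensions):

    using CUDA
    using CUDA.CUDNN: cudnnActivationForward, CUDNN_ACTIVATION_RELU

    x = CUDA.randn(Float32, 4, 4, 1, 1)
    y = similar(x)
    cudnnActivationForward(x, y; mode=CUDNN_ACTIVATION_RELU)  # writes the activation into y, leaves x untouched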

lib/cudnn/nnlib.jl

Lines changed: 35 additions & 18 deletions
@@ -233,24 +233,41 @@ meanpool!(y::DenseCuArray{T}, x::DenseCuArray{T}, pdims::PoolDims) where T<:CUDN
 
 # Activation
 
-# in-place for x
-Base.broadcasted(::typeof(NNlib.σ), x::DenseCuArray{T}) where {T<:CUDNNFloat} =
-  (cudnnActivationForward(reshape4D(x), mode=CUDNN_ACTIVATION_SIGMOID, coeff=0.0); return x)
-
-Base.broadcasted(::typeof(NNlib.relu), x::DenseCuArray{T}) where {T<:CUDNNFloat} =
-  (cudnnActivationForward(reshape4D(x), mode=CUDNN_ACTIVATION_RELU, coeff=0.0); return x)
-
-Base.broadcasted(::typeof(NNlib.tanh), x::DenseCuArray{T}) where {T<:CUDNNFloat} =
-  (cudnnActivationForward(reshape4D(x), mode=CUDNN_ACTIVATION_TANH, coeff=0.0); return x)
-
-Base.broadcasted(::typeof(NNlib.relu6), x::DenseCuArray{T}) where {T<:CUDNNFloat} =
-  (cudnnActivationForward(reshape4D(x), mode=CUDNN_ACTIVATION_CLIPPED_RELU, coeff=6.0); return x)
+using Base.Broadcast
+
+for (f, op) in [
+    CUDA.tanh       => (src,dst)->cudnnActivationForward(reshape4D(src), reshape4D(dst),
+                                                          mode=CUDNN_ACTIVATION_TANH),
+    NNlib.σ         => (src,dst)->cudnnActivationForward(reshape4D(src), reshape4D(dst),
+                                                          mode=CUDNN_ACTIVATION_SIGMOID),
+    NNlib.elu       => (src,dst)->cudnnActivationForward(reshape4D(src), reshape4D(dst),
+                                                          mode=CUDNN_ACTIVATION_ELU),
+    NNlib.relu      => (src,dst)->cudnnActivationForward(reshape4D(src), reshape4D(dst),
+                                                          mode=CUDNN_ACTIVATION_RELU),
+    NNlib.relu6     => (src,dst)->cudnnActivationForward(reshape4D(src), reshape4D(dst),
+                                                          mode=CUDNN_ACTIVATION_CLIPPED_RELU,
+                                                          coeff=6.0),
+    NNlib.leakyrelu => (src,dst)->cudnnOpTensor(CUDNN_OP_TENSOR_MAX, reshape4D(src),
+                                                reshape4D(src), reshape4D(dst),
+                                                alpha1=0.01)]
+    @eval begin
+        # in-place
+        function Base.materialize!(dst::DenseCuArray{<:CUDNNFloat},
+                                   bc::Broadcast.Broadcasted{<:Any,<:Any,typeof($f),<:Tuple{DenseCuArray}})
+            $op(bc.args[1], dst)
+            return dst
+        end
 
-Base.broadcasted(::typeof(NNlib.elu), x::DenseCuArray{T}) where {T<:CUDNNFloat} =
-  (cudnnActivationForward(reshape4D(x), mode=CUDNN_ACTIVATION_ELU, coeff=1.0); return x)
+        # out of place
+        function Base.materialize(bc::Broadcast.Broadcasted{<:Any,<:Any,typeof($f),<:Tuple{DenseCuArray}})
+            ElType = Broadcast.combine_eltypes(bc.f, bc.args)
+            dst = similar(bc, ElType)
+            $op(bc.args[1], dst)
+            return dst
+        end
+    end
+end
 
 # CUDNN_ACTIVATION_IDENTITY does not work with cudnnActivationForward
-Base.broadcasted(::typeof(NNlib.identity), x::DenseCuArray{T}) where {T<:CUDNNFloat} = x
-
-Base.broadcasted(::typeof(NNlib.leakyrelu), x::DenseCuArray{T}, a=T(0.01)) where {T<:CUDNNFloat} =
-  (cudnnOpTensor(CUDNN_OP_TENSOR_MAX, reshape4D(x), reshape4D(x), reshape4D(x), alpha1=a); return x)
+# FIXME: put this optimization in GPUArrays' `copyto!` (like Base.Broadcast's `copyto!`)
+Base.broadcasted(::typeof(identity), x::DenseCuArray{T}) where {T<:CUDNNFloat} = x
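Why `materialize`/`materialize!` rather than `broadcasted`: dot syntax lowers in two stages, with `Base.broadcasted` only building a lazy `Broadcasted` container and `Base.materialize`/`Base.materialize!` deciding where the result is written. Overloading the latter lets the CUDNN fast path target the broadcast's actual destination instead of mutating the argument. Roughly, per standard Julia lowering (not code from this diff):

    y = tanh.(x)     # becomes y = Base.materialize(Base.broadcasted(tanh, x))   -> new out-of-place method
    y .= tanh.(x)    # becomes Base.materialize!(y, Base.broadcasted(tanh, x))   -> new in-place method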

lib/cudnn/tensor.jl

Lines changed: 2 additions & 2 deletions
@@ -62,7 +62,7 @@ OpTensorDesc(op::cudnnOpTensorOp_t, a::DenseCuArray) = OpTensorDesc(op, eltype(a
 
 function cudnnOpTensor(op::cudnnOpTensorOp_t,
                        A::DenseCuArray{T,N}, B::DenseCuArray{T,N}, C::DenseCuArray{T,N};
-                       alpha1=1, alpha2=1, beta=0) where {T,N}
+                       alpha1=true, alpha2=true, beta=false) where {T,N}
     cudnnOpTensor(handle(), OpTensorDesc(op, T),
                   scalingParameter(T, alpha1), TensorDesc(A), A,
                   scalingParameter(T, alpha2), TensorDesc(B), B,

@@ -113,7 +113,7 @@ end
 
 function cudnnReduceTensor(op::cudnnReduceTensorOp_t,
                            A::DenseCuArray{T,N}, C::DenseCuArray{T,N};
-                           alpha=1, beta=0) where {T,N}
+                           alpha=true, beta=false) where {T,N}
     # indices = Array{UInt64, 1}(undef, N)
     indicesSizeInBytes = cudnnGetReductionIndicesSize(op, A, C)
     @workspace size=@argout(
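As in activation.jl, the scaling defaults change from `1`/`0` to `true`/`false`. In Julia, `Bool` is the narrowest one/zero: under promotion it combines with any numeric type without widening it, which is presumably the point of the convention here, keeping the defaults type-generic until `scalingParameter(T, ...)` converts them. Plain Julia semantics, for illustration only:

    true  * 1.5f0   # 1.5f0  -- Float32 preserved
    0.0   * 1.5f0   # 1.5    -- a literal 0.0 promotes the result to Float64
    false * 1.5f0   # 0.0f0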

test/cudnn.jl

Lines changed: 15 additions & 0 deletions
@@ -82,9 +82,24 @@ end
             @test testf(x -> f.(x), rand(Float64, dims))
         end
     end
+
     # softplus does not give `Inf` for large arguments
     x = CuArray([1000.])
     @test all(softplus.(x) .== x)
+
+    # optimized activation overwrote inputs
+    let
+        x = CUDA.ones(1)
+        @test Array(x) == [1f0]
+        tanh.(x)
+        @test Array(x) == [1f0]
+        y = tanh.(x)
+        @test Array(x) == [1f0]
+        @test Array(y) == [tanh(1f0)]
+        x .= tanh.(y)
+        @test Array(y) == [tanh(1f0)]
+        @test Array(x) == [tanh(tanh(1f0))]
+    end
 end
 
 @testset "Batchnorm" begin
