
Commit 836de1d

Merge pull request #515 from JuliaGPU/tb/cudnn_broadcast
Fix CUDNN-optimized activation broadcasts
2 parents 9465f3b + beffb7d commit 836de1d

4 files changed (+58, -23 lines)
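The bug being fixed: the old broadcast overloads in lib/cudnn/nnlib.jl applied the CUDNN activation in place on the input and returned it, so even an out-of-place broadcast like `y = tanh.(x)` silently overwrote `x`. A minimal sketch of the symptom (assuming a CUDA-capable setup with the CUDA.jl package loaded; this mirrors the new test added below):

    using CUDA

    x = CUDA.ones(1)   # [1f0] on the GPU
    y = tanh.(x)       # should allocate a new array and leave x alone
    Array(x)           # with this fix: [1f0]; before it, x had been overwritten with tanh(1f0)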

lib/cudnn/activation.jl

Lines changed: 6 additions & 3 deletions
@@ -22,16 +22,19 @@ end
 
 function cudnnActivationForward(x::DenseCuArray{T,N}, y::DenseCuArray{T,N}=x;
                                 mode=CUDNN_ACTIVATION_RELU, # CUDNN_ACTIVATION_IDENTITY will not work
-                                coeff=0.0, reluNanOpt=CUDNN_NOT_PROPAGATE_NAN, alpha=1, beta=0) where {T,N}
+                                coeff=false, reluNanOpt=CUDNN_NOT_PROPAGATE_NAN, alpha=true,
+                                beta=false) where {T,N}
     cudnnActivationForward(handle(), ActivationDesc(mode, T(coeff), reluNanOpt),
                            scalingParameter(T, alpha), TensorDesc(x), x,
                            scalingParameter(T, beta ), TensorDesc(y), y)
     return y
 end
 
-function cudnnActivationBackward(x::DenseCuArray{T,N}, dx::DenseCuArray{T,N}, y::DenseCuArray{T,N}, dy::DenseCuArray{T,N}=dx;
+function cudnnActivationBackward(x::DenseCuArray{T,N}, dx::DenseCuArray{T,N},
+                                 y::DenseCuArray{T,N}, dy::DenseCuArray{T,N}=dx;
                                  mode=CUDNN_ACTIVATION_RELU, # CUDNN_ACTIVATION_IDENTITY will not work
-                                 coeff=0.0, reluNanOpt=CUDNN_NOT_PROPAGATE_NAN, alpha=1, beta=0) where {T,N}
+                                 coeff=false, reluNanOpt=CUDNN_NOT_PROPAGATE_NAN, alpha=1,
+                                 beta=false) where {T,N}
     cudnnActivationBackward(handle(), ActivationDesc(mode, T(coeff), reluNanOpt),
                             scalingParameter(T, alpha), TensorDesc( y), y,
                             TensorDesc(dy), dy,
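Besides the reformatted signatures, most defaults move from numeric literals to Bool one/zero (see the note after the tensor.jl hunk below). The destination `y` still defaults to `x`, which is what the old broadcast code leaned on; the rewritten nnlib.jl code below always passes an explicit destination. An out-of-place call might look like this (a sketch only, assuming the unexported `CUDA.CUDNN` internals and a 4-D array, since cuDNN tensor descriptors generally expect at least four dimensions):

    using CUDA
    using CUDA.CUDNN: cudnnActivationForward, CUDNN_ACTIVATION_RELU

    x = CUDA.randn(Float32, 4, 4, 1, 1)
    y = similar(x)
    cudnnActivationForward(x, y; mode=CUDNN_ACTIVATION_RELU)  # writes the activation into y, leaves x untouched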

lib/cudnn/nnlib.jl

Lines changed: 35 additions & 18 deletions
@@ -233,24 +233,41 @@ meanpool!(y::DenseCuArray{T}, x::DenseCuArray{T}, pdims::PoolDims) where T<:CUDN
 
 # Activation
 
-# in-place for x
-Base.broadcasted(::typeof(NNlib.σ), x::DenseCuArray{T}) where {T<:CUDNNFloat} =
-  (cudnnActivationForward(reshape4D(x), mode=CUDNN_ACTIVATION_SIGMOID, coeff=0.0); return x)
-
-Base.broadcasted(::typeof(NNlib.relu), x::DenseCuArray{T}) where {T<:CUDNNFloat} =
-  (cudnnActivationForward(reshape4D(x), mode=CUDNN_ACTIVATION_RELU, coeff=0.0); return x)
-
-Base.broadcasted(::typeof(NNlib.tanh), x::DenseCuArray{T}) where {T<:CUDNNFloat} =
-  (cudnnActivationForward(reshape4D(x), mode=CUDNN_ACTIVATION_TANH, coeff=0.0); return x)
-
-Base.broadcasted(::typeof(NNlib.relu6), x::DenseCuArray{T}) where {T<:CUDNNFloat} =
-  (cudnnActivationForward(reshape4D(x), mode=CUDNN_ACTIVATION_CLIPPED_RELU, coeff=6.0); return x)
+using Base.Broadcast
+
+for (f, op) in [
+    CUDA.tanh       => (src,dst)->cudnnActivationForward(reshape4D(src), reshape4D(dst),
+                                                          mode=CUDNN_ACTIVATION_TANH),
+    NNlib.σ         => (src,dst)->cudnnActivationForward(reshape4D(src), reshape4D(dst),
+                                                          mode=CUDNN_ACTIVATION_SIGMOID),
+    NNlib.elu       => (src,dst)->cudnnActivationForward(reshape4D(src), reshape4D(dst),
+                                                          mode=CUDNN_ACTIVATION_ELU),
+    NNlib.relu      => (src,dst)->cudnnActivationForward(reshape4D(src), reshape4D(dst),
+                                                          mode=CUDNN_ACTIVATION_RELU),
+    NNlib.relu6     => (src,dst)->cudnnActivationForward(reshape4D(src), reshape4D(dst),
+                                                          mode=CUDNN_ACTIVATION_CLIPPED_RELU,
+                                                          coeff=6.0),
+    NNlib.leakyrelu => (src,dst)->cudnnOpTensor(CUDNN_OP_TENSOR_MAX, reshape4D(src),
+                                                reshape4D(src), reshape4D(dst),
+                                                alpha1=0.01)]
+    @eval begin
+        # in-place
+        function Base.materialize!(dst::DenseCuArray{<:CUDNNFloat},
+                                   bc::Broadcast.Broadcasted{<:Any,<:Any,typeof($f),<:Tuple{DenseCuArray}})
+            $op(bc.args[1], dst)
+            return dst
+        end
 
-Base.broadcasted(::typeof(NNlib.elu), x::DenseCuArray{T}) where {T<:CUDNNFloat} =
-  (cudnnActivationForward(reshape4D(x), mode=CUDNN_ACTIVATION_ELU, coeff=1.0); return x)
+        # out of place
+        function Base.materialize(bc::Broadcast.Broadcasted{<:Any,<:Any,typeof($f),<:Tuple{DenseCuArray}})
+            ElType = Broadcast.combine_eltypes(bc.f, bc.args)
+            dst = similar(bc, ElType)
+            $op(bc.args[1], dst)
+            return dst
+        end
+    end
+end
 
 # CUDNN_ACTIVATION_IDENTITY does not work with cudnnActivationForward
-Base.broadcasted(::typeof(NNlib.identity), x::DenseCuArray{T}) where {T<:CUDNNFloat} = x
-
-Base.broadcasted(::typeof(NNlib.leakyrelu), x::DenseCuArray{T}, a=T(0.01)) where {T<:CUDNNFloat} =
-  (cudnnOpTensor(CUDNN_OP_TENSOR_MAX, reshape4D(x), reshape4D(x), reshape4D(x), alpha1=a); return x)
+# FIXME: put this optimization in GPUArrays' `copyto!` (like Base.Broadcast's `copyto!`)
+Base.broadcasted(::typeof(identity), x::DenseCuArray{T}) where {T<:CUDNNFloat} = x
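Why `materialize`/`materialize!` rather than `broadcasted`: dot syntax lowers in two stages, with `Base.broadcasted` only building a lazy `Broadcasted` container and `Base.materialize`/`Base.materialize!` deciding where the result is written. Overloading the latter lets the CUDNN fast path target the broadcast's actual destination instead of mutating the argument. Roughly, per standard Julia lowering (not code from this diff):

    y = tanh.(x)     # becomes y = Base.materialize(Base.broadcasted(tanh, x))   -> new out-of-place method
    y .= tanh.(x)    # becomes Base.materialize!(y, Base.broadcasted(tanh, x))   -> new in-place method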

lib/cudnn/tensor.jl

Lines changed: 2 additions & 2 deletions
@@ -62,7 +62,7 @@ OpTensorDesc(op::cudnnOpTensorOp_t, a::DenseCuArray) = OpTensorDesc(op, eltype(a
 
 function cudnnOpTensor(op::cudnnOpTensorOp_t,
                        A::DenseCuArray{T,N}, B::DenseCuArray{T,N}, C::DenseCuArray{T,N};
-                       alpha1=1, alpha2=1, beta=0) where {T,N}
+                       alpha1=true, alpha2=true, beta=false) where {T,N}
     cudnnOpTensor(handle(), OpTensorDesc(op, T),
                   scalingParameter(T, alpha1), TensorDesc(A), A,
                   scalingParameter(T, alpha2), TensorDesc(B), B,

@@ -113,7 +113,7 @@ end
 
 function cudnnReduceTensor(op::cudnnReduceTensorOp_t,
                            A::DenseCuArray{T,N}, C::DenseCuArray{T,N};
-                           alpha=1, beta=0) where {T,N}
+                           alpha=true, beta=false) where {T,N}
     # indices = Array{UInt64, 1}(undef, N)
     indicesSizeInBytes = cudnnGetReductionIndicesSize(op, A, C)
     @workspace size=@argout(
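As in activation.jl, the scaling defaults change from `1`/`0` to `true`/`false`. In Julia, `Bool` is the narrowest one/zero: under promotion it combines with any numeric type without widening it, which is presumably the point of the convention here, keeping the defaults type-generic until `scalingParameter(T, ...)` converts them. Plain Julia semantics, for illustration only:

    true  * 1.5f0   # 1.5f0  -- Float32 preserved
    0.0   * 1.5f0   # 1.5    -- a literal 0.0 promotes the result to Float64
    false * 1.5f0   # 0.0f0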

test/cudnn.jl

Lines changed: 15 additions & 0 deletions
@@ -82,9 +82,24 @@ end
             @test testf(x -> f.(x), rand(Float64, dims))
         end
     end
+
     # softplus does not give `Inf` for large arguments
     x = CuArray([1000.])
     @test all(softplus.(x) .== x)
+
+    # optimized activation overwrote inputs
+    let
+        x = CUDA.ones(1)
+        @test Array(x) == [1f0]
+        tanh.(x)
+        @test Array(x) == [1f0]
+        y = tanh.(x)
+        @test Array(x) == [1f0]
+        @test Array(y) == [tanh(1f0)]
+        x .= tanh.(y)
+        @test Array(y) == [tanh(1f0)]
+        @test Array(x) == [tanh(tanh(1f0))]
+    end
 end
 
 @testset "Batchnorm" begin
