
Commit c09657a

Add option to set bias=false to use Zeros as bias
1 parent d17bcd7 commit c09657a

File tree: 10 files changed, +78 -46 lines changed

NEWS.md

Lines changed: 3 additions & 0 deletions
@@ -1,3 +1,6 @@
+# v0.11.3
+* Added option to set `bias` to [false](https://github.com/FluxML/Flux.jl/pull/1379) to exclude `bias` from being trained.
+
 # v0.11.2

 * Adds the [AdaBelief](https://arxiv.org/abs/2010.07468) optimiser.

docs/src/models/layers.md

Lines changed: 0 additions & 1 deletion
@@ -24,7 +24,6 @@ ConvTranspose
 CrossCor
 SamePad
 flatten
-Flux.Zeros
 Flux.convfilter
 Flux.depthwiseconvfilter
 ```

src/layers/basic.jl

Lines changed: 8 additions & 3 deletions
@@ -83,7 +83,7 @@ extraChain(::Tuple{}, x) = ()


 """
-    Dense(in::Integer, out::Integer, σ = identity)
+    Dense(in::Integer, out::Integer, σ = identity; bias=true)

 Create a traditional `Dense` layer with parameters `W` and `b`.

@@ -92,6 +92,8 @@ Create a traditional `Dense` layer with parameters `W` and `b`.
 The input `x` must be a vector of length `in`, or a batch of vectors represented
 as an `in × N` matrix. The output `y` will be a vector or batch of length `out`.

+Setting `bias` to `false` will switch bias off for the layer.
+
 # Example
 ```
 julia> d = Dense(5, 2)
@@ -101,6 +103,9 @@ julia> d(rand(5))
 2-element Array{Float32,1}:
 -0.16210233
 0.123119034
+
+julia> d = Dense(5, 2; bias=false)
+Dense(5, 2)
 ```
 """
 struct Dense{F,S<:AbstractArray,T<:Union{Zeros, AbstractVector}}
@@ -112,8 +117,8 @@ end
 Dense(W, b) = Dense(W, b, identity)

 function Dense(in::Integer, out::Integer, σ = identity;
-               initW = glorot_uniform, initb = zeros)
-  return Dense(initW(out, in), initb(out), σ)
+               initW = glorot_uniform, initb = zeros, bias=true)
+  return Dense(initW(out, in), create_bias(bias, initb, out), σ)
 end

 @functor Dense
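
For orientation (not part of the diff): a minimal sketch of what the new keyword does for `Dense`, assuming Flux at this commit, where `Flux.params` collects only trainable arrays:

```julia
using Flux

d = Dense(5, 2; bias = false)  # weight initialised as usual, bias replaced by Flux.Zeros()
length(Flux.params(d))         # 1: only the weight matrix is trainable
y = d(rand(Float32, 5))        # forward pass unchanged; the bias term contributes nothing
size(y)                        # (2,)
```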

src/layers/conv.jl

Lines changed: 28 additions & 24 deletions
@@ -46,7 +46,7 @@ In other words, a 100×100 RGB image would be a `100×100×3×1` array,
 and a batch of 50 would be a `100×100×3×50` array.

 Accepts keyword arguments `weight` and `bias` to set the corresponding fields.
-Setting `bias` to `Flux.Zeros()` will switch bias off for the layer.
+Setting `bias` to `false` will switch bias off for the layer.

 Takes the keyword arguments `pad`, `stride` and `dilation`.
 For input dimension N,
@@ -82,7 +82,7 @@ end

 Constructs the convolutional layer with user-defined weight and bias arrays.

-Setting `bias` to `Flux.Zeros()` would switch `bias` off for the layer.
+Setting `bias` to `false` will switch `bias` off for the layer.

 Takes the keyword arguments `pad`, `stride` and `dilation`.
 For input dimension N,
@@ -102,15 +102,16 @@ Conv(weight = weight,
      σ = sigmoid)
 ```
 """
-function Conv(w::AbstractArray{T,N}, b::Union{Zeros, AbstractVector{T}}, σ = identity;
+function Conv(w::AbstractArray{T,N}, b::Union{Bool, Zeros, AbstractVector{T}}, σ = identity;
               stride = 1, pad = 0, dilation = 1) where {T,N}
   stride = expand(Val(N-2), stride)
   dilation = expand(Val(N-2), dilation)
   pad = calc_padding(Conv, pad, size(w)[1:N-2], dilation, stride)
-  return Conv(σ, w, b, stride, pad, dilation)
+  bias = create_bias(b, zeros, size(w, N))
+  return Conv(σ, w, bias, stride, pad, dilation)
 end

-function Conv(;weight::AbstractArray{T,N}, bias::Union{Zeros, AbstractVector{T}},
+function Conv(;weight::AbstractArray{T,N}, bias::Union{Bool, Zeros, AbstractVector{T}},
               activation = identity, stride = 1, pad = 0, dilation = 1) where {T,N}
   Conv(weight, bias, activation, stride = stride, pad = pad, dilation = dilation)
 end
@@ -131,7 +132,7 @@ convfilter(filter::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer};

 function Conv(k::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}, σ = identity;
               init = glorot_uniform, stride = 1, pad = 0, dilation = 1,
-              weight = convfilter(k, ch, init = init), bias = zeros(ch[2])) where N
+              weight = convfilter(k, ch, init = init), bias = true) where N

   Conv(weight, bias, σ,
        stride = stride, pad = pad, dilation = dilation)
@@ -189,7 +190,7 @@ In other words, a 100×100 RGB image would be a `100×100×3×1` array,
 and a batch of 50 would be a `100×100×3×50` array.

 Accepts keyword arguments `weight` and `bias` to set the corresponding fields.
-Setting `bias` to `Flux.Zeros()` will switch bias off for the layer.
+Setting `bias` to `false` will switch bias off for the layer.

 Takes the keyword arguments `pad`, `stride` and `dilation`.
 For input dimension N,
@@ -215,7 +216,7 @@ end
 Constructs the convolutional transpose layer with user-defined weight and bias arrays.

-Setting `bias` to `Flux.Zeros()` would switch `bias` off for the layer.
+Setting `bias` to `false` will switch bias off for the layer.

 Takes the keyword arguments `pad`, `stride` and `dilation`.
 For input dimension N,
@@ -226,22 +227,23 @@ indicating padding values for each spatial dimension at both the ends.

 For keyword-only constructor, see also [`Conv`](@ref)
 """
-function ConvTranspose(w::AbstractArray{T,N}, b::Union{Zeros, AbstractVector{T}}, σ = identity;
+function ConvTranspose(w::AbstractArray{T,N}, b::Union{Bool, Zeros, AbstractVector{T}}, σ = identity;
                        stride = 1, pad = 0, dilation = 1) where {T,N}
   stride = expand(Val(N-2), stride)
   dilation = expand(Val(N-2), dilation)
   pad = calc_padding(ConvTranspose, pad, size(w)[1:N-2], dilation, stride)
-  return ConvTranspose(σ, w, b, stride, pad, dilation)
+  bias = create_bias(b, zeros, size(w, N))
+  return ConvTranspose(σ, w, bias, stride, pad, dilation)
 end

-function ConvTranspose(;weight::AbstractArray{T,N}, bias::Union{Zeros, AbstractVector{T}},
+function ConvTranspose(;weight::AbstractArray{T,N}, bias::Union{Bool, Zeros, AbstractVector{T}},
                        activation = identity, stride = 1, pad = 0, dilation = 1) where {T,N}
   ConvTranspose(weight, bias, activation, stride = stride, pad = pad, dilation = dilation)
 end

 function ConvTranspose(k::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}, σ = identity;
                        init = glorot_uniform, stride = 1, pad = 0, dilation = 1,
-                       weight = convfilter(k, reverse(ch), init = init), bias = zeros(ch[2])) where N
+                       weight = convfilter(k, reverse(ch), init = init), bias = true) where N

   ConvTranspose(weight, bias, σ,
                 stride = stride, pad = pad, dilation = dilation)
@@ -307,7 +309,7 @@ In other words, a 100×100 RGB image would be a `100×100×3×1` array,
 and a batch of 50 would be a `100×100×3×50` array.

 Accepts keyword arguments `weight` and `bias` to set the corresponding fields.
-Setting `bias` to `Flux.Zeros()` will switch bias off for the layer.
+Setting `bias` to `false` will switch bias off for the layer.

 Takes the keyword arguments `pad`, `stride` and `dilation`.
 For input dimension N,
@@ -333,7 +335,7 @@ end
 Constructs the `DepthwiseConv` layer with user-defined weight and bias arrays.

-Setting `bias` to `Flux.Zeros()` would switch `bias` off for the layer.
+Setting `bias` to `false` will switch `bias` off for the layer.

 Takes the keyword arguments `pad`, `stride` and `dilation`.
 For input dimension N,
@@ -344,15 +346,16 @@ indicating padding values for each spatial dimension at both the ends.

 For keyword-only constructor, see also [`Conv`](@ref)
 """
-function DepthwiseConv(w::AbstractArray{T,N}, b::Union{Zeros, AbstractVector{T}}, σ = identity;
+function DepthwiseConv(w::AbstractArray{T,N}, b::Union{Bool, Zeros, AbstractVector{T}}, σ = identity;
                        stride = 1, pad = 0, dilation = 1) where {T,N}
   stride = expand(Val(N-2), stride)
   dilation = expand(Val(N-2), dilation)
   pad = calc_padding(DepthwiseConv, pad, size(w)[1:N-2], dilation, stride)
-  return DepthwiseConv(σ, w, b, stride, pad, dilation)
+  bias = create_bias(b, zeros, prod(size(w)[N-1:end]))
+  return DepthwiseConv(σ, w, bias, stride, pad, dilation)
 end

-function DepthwiseConv(;weight::AbstractArray{T,N}, bias::Union{Zeros, AbstractVector{T}},
+function DepthwiseConv(;weight::AbstractArray{T,N}, bias::Union{Bool, Zeros, AbstractVector{T}},
                        activation = identity, stride = 1, pad = 0, dilation = 1) where {T,N}
   DepthwiseConv(weight, bias, activation, stride = stride, pad = pad, dilation = dilation)
 end
@@ -373,7 +376,7 @@ depthwiseconvfilter(filter::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer};

 function DepthwiseConv(k::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}, σ = identity;
                        init = glorot_uniform, stride = 1, pad = 0, dilation = 1,
-                       weight = depthwiseconvfilter(k, ch, init = init), bias = zeros(ch[2])) where N
+                       weight = depthwiseconvfilter(k, ch, init = init), bias = true) where N
   @assert ch[2] % ch[1] == 0 "Output channels must be integer multiple of input channels"

   return DepthwiseConv(
@@ -424,7 +427,7 @@ In other words, a 100×100 RGB image would be a `100×100×3×1` array,
 and a batch of 50 would be a `100×100×3×50` array.

 Accepts keyword arguments `weight` and `bias` to set the corresponding fields.
-Setting `bias` to `Flux.Zeros()` will switch bias off for the layer.
+Setting `bias` to `false` will switch bias off for the layer.

 Takes the keyword arguments `pad`, `stride` and `dilation`.
 For input dimension N,
@@ -461,7 +464,7 @@ end
 Constructs the standard cross convolutional layer with user-defined weight and bias
 arrays.

-Setting `bias` to `Flux.Zeros()` would switch `bias` off for the layer.
+Setting `bias` to `false` will switch `bias` off for the layer.

 Takes the keyword arguments `pad`, `stride` and `dilation`.
 For input dimension N,
@@ -472,22 +475,23 @@ indicating padding values for each spatial dimension at both the ends.

 For keyword-only constructor, see also [`Conv`](@ref)
 """
-function CrossCor(w::AbstractArray{T,N}, b::Union{Zeros, AbstractVector{T}}, σ = identity;
+function CrossCor(w::AbstractArray{T,N}, b::Union{Bool, Zeros, AbstractVector{T}}, σ = identity;
                   stride = 1, pad = 0, dilation = 1) where {T,N}
   stride = expand(Val(N-2), stride)
   dilation = expand(Val(N-2), dilation)
   pad = calc_padding(CrossCor, pad, size(w)[1:N-2], dilation, stride)
-  return CrossCor(σ, w, b, stride, pad, dilation)
+  bias = create_bias(b, zeros, size(w, N))
+  return CrossCor(σ, w, bias, stride, pad, dilation)
 end

-function CrossCor(;weight::AbstractArray{T,N}, bias::Union{Zeros, AbstractVector{T}},
+function CrossCor(;weight::AbstractArray{T,N}, bias::Union{Bool, Zeros, AbstractVector{T}},
                   activation = identity, stride = 1, pad = 0, dilation = 1) where {T,N}
   CrossCor(weight, bias, activation, stride = stride, pad = pad, dilation = dilation)
 end

 function CrossCor(k::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}, σ = identity;
                   init = glorot_uniform, stride = 1, pad = 0, dilation = 1,
-                  weight = convfilter(k, ch, init = init), bias = zeros(ch[2])) where N
+                  weight = convfilter(k, ch, init = init), bias = true) where N

   CrossCor(weight, bias, σ,
            stride = stride, pad = pad, dilation = dilation)
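
Again for orientation (not part of the diff): the same keyword on a convolution, with a gradient call in the style of the updated tests below; a sketch assuming Flux at this commit:

```julia
using Flux

c = Conv((2, 2), 1 => 3; bias = false)  # bias stored as Flux.Zeros()
x = rand(Float32, 28, 28, 1, 1)
ps = Flux.params(c)                     # contains only the weight array
gs = gradient(() -> sum(c(x)), ps)      # no gradient entry is produced for the bias
length(ps)                              # 1
```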

src/utils.jl

Lines changed: 14 additions & 0 deletions
@@ -176,6 +176,20 @@ zeros(T::Type, dims...) = Base.zeros(T, dims...)
 ones(dims...) = Base.ones(Float32, dims...)
 zeros(dims...) = Base.zeros(Float32, dims...)

+"""
+    create_bias(shallcreate::Bool, iftrue, dims...)
+    create_bias(x, ::Any...)
+
+Return a bias parameter for a layer.
+
+Essentially handles the allowed input options for the `bias` keyword:
+If `false`: return the `Zeros` type, which turns bias off.
+If `true`: return the result of `iftrue(dims...)`.
+If not a boolean, return `x` itself, to handle the case `bias = somearray`.
+"""
+create_bias(shallcreate::Bool, iftrue, dims...) = shallcreate ? iftrue(dims...) : Zeros()
+create_bias(x, ::Any...) = x
+
 """
     unsqueeze(xs, dim)
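
For orientation (not part of the diff): how the new helper dispatches on its first argument; a sketch assuming the internal helper is reached as `Flux.create_bias`:

```julia
using Flux

Flux.create_bias(true, Flux.zeros, 3)   # bias = true:  Flux.zeros(3), a trainable Float32 vector
Flux.create_bias(false, Flux.zeros, 3)  # bias = false: Flux.Zeros(), switching bias off
Flux.create_bias(rand(Float32, 3))      # bias = somearray: the array itself passes through
```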

src/zeros.jl

Lines changed: 6 additions & 3 deletions
@@ -11,11 +11,14 @@ Useful to turn bias off for a forward pass of a layer.
 ## Examples

 ```julia-repl
-julia> bias_less_conv = Conv((2,2), 1=>3, bias = Flux.Zeros())
+julia> bias_less_conv = Conv((2,2), 1=>3; bias = false)
 Conv((2, 2), 1=>3)

-julia> bias_less_dense = Dense(10, 2, initb = Zeros)
-Dense(10, 2)
+julia> params(bias_less_conv) |> length
+1
+
+julia> bias_less_conv.bias
+Flux.Zeros()
 ```
 """
 struct Zeros end
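
For orientation (not part of the diff): the placeholder is a singleton that survives precision conversion, which the updated `test/utils.jl` below asserts; a minimal sketch:

```julia
using Flux

m = Dense(3, 2; bias = false)
m.b === Flux.Zeros()            # true: the field holds the placeholder, not a vector
Flux.f64(m).b === Flux.Zeros()  # true: f64/f32 leave Zeros() untouched
```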

test/cuda/layers.jl

Lines changed: 5 additions & 5 deletions
@@ -46,10 +46,10 @@ end
 # Repeats from Conv, CrossCor

 # Just to give testset in gradtest meaningful labels
-ConvNoBias(args...) = Conv(args...; bias=Flux.Zeros())
-ConvTransposeNoBias(args...) = ConvTranspose(args...; bias=Flux.Zeros())
-CrossCorNoBias(args...) = CrossCor(args...; bias=Flux.Zeros())
-DepthwiseConvNoBias(args...) = DepthwiseConv(args...; bias=Flux.Zeros())
+ConvNoBias(args...) = Conv(args...; bias=false)
+ConvTransposeNoBias(args...) = ConvTranspose(args...; bias=false)
+CrossCorNoBias(args...) = CrossCor(args...; bias=false)
+DepthwiseConvNoBias(args...) = DepthwiseConv(args...; bias=false)
 r = rand(Float32, 28, 28, 1, 1)
 conv_layers = [Conv, ConvNoBias, ConvTranspose, ConvTransposeNoBias, CrossCor, CrossCorNoBias, DepthwiseConv, DepthwiseConvNoBias]
 gradtest("Conv", conv_layers, r, (2,2), 1=>3)
@@ -102,7 +102,7 @@ end
 end

 @testset "Zeros mapped for $cl" for cl in (Conv, ConvTranspose, CrossCor, DepthwiseConv)
-  l = cl((2,2), 1=>3, bias = Flux.Zeros()) |> gpu
+  l = cl((2,2), 1=>3, bias = false) |> gpu
   ip = zeros(Float32, 28,28,1,1) |> gpu
   if cl in BROKEN_LAYERS
     @test_broken sum(l(ip)) ≈ 0.f0

test/layers/basic.jl

Lines changed: 1 addition & 1 deletion
@@ -45,7 +45,7 @@ import Flux: activations
 @test Dense(10, 1, identity, initW = ones, initb = zeros)(ones(10,2)) == 10*ones(1, 2)
 @test Dense(10, 2, identity, initW = ones, initb = zeros)(ones(10,1)) == 10*ones(2, 1)
 @test Dense(10, 2, identity, initW = ones, initb = zeros)([ones(10,1) 2*ones(10,1)]) == [10 20; 10 20]
-@test Dense(10, 2, identity, initW = ones, initb = Zeros)([ones(10,1) 2*ones(10,1)]) == [10 20; 10 20]
+@test Dense(10, 2, identity, initW = ones, bias = false)([ones(10,1) 2*ones(10,1)]) == [10 20; 10 20]
 end

 @testset "Diagonal" begin

test/layers/conv.jl

Lines changed: 12 additions & 8 deletions
@@ -42,8 +42,8 @@ end
   op = bias(ip)
   @test sum(op) == prod(size(op))

-  @testset "Zeros mapped through $lmap" for lmap in (identity, cpu, f32)
-    bias = Conv((2,2), 1=>3, bias = Flux.Zeros()) |> lmap
+  @testset "No bias mapped through $lmap" for lmap in (identity, cpu, f32)
+    bias = Conv((2,2), 1=>3, bias = false) |> lmap
     op = bias(ip)
     @test sum(op) ≈ 0.f0
     gs = gradient(() -> sum(bias(ip)), Flux.params(bias))
@@ -52,7 +52,7 @@ end

   # Train w/o bias and make sure no convergence happens
   # when only bias can be converged
-  bias = Conv((2, 2), 1=>3, bias = Flux.Zeros());
+  bias = Conv((2, 2), 1=>3, bias = false);
   ip = zeros(Float32, 28,28,1,1)
   op = zeros(Float32, 27,27,3,1) .+ 2.f0
   opt = Descent()
@@ -87,8 +87,11 @@ end
   m1 = DepthwiseConv((2, 2), 3=>15)
   @test size(m1(r), 3) == 15

-  m3 = DepthwiseConv((2, 3), 3=>9)
-  @test size(m3(r), 3) == 9
+  m2 = DepthwiseConv((2, 3), 3=>9)
+  @test size(m2(r), 3) == 9
+
+  m3 = DepthwiseConv((2, 3), 3=>9; bias=false)
+  @test size(m3(r), 3) == 9

   # Test that we cannot ask for non-integer multiplication factors
   @test_throws AssertionError DepthwiseConv((2,2), 3=>10)
@@ -97,8 +100,9 @@ end
 @testset "ConvTranspose" begin
   x = zeros(Float32, 28, 28, 1, 1)
   y = Conv((3,3), 1 => 1)(x)
-  x_hat = ConvTranspose((3, 3), 1 => 1)(y)
-  @test size(x_hat) == size(x)
+  x_hat1 = ConvTranspose((3, 3), 1 => 1)(y)
+  x_hat2 = ConvTranspose((3, 3), 1 => 1, bias=false)(y)
+  @test size(x_hat1) == size(x_hat2) == size(x)

   m = ConvTranspose((3,3), 1=>1)
   # Test that the gradient call does not throw: #900
@@ -116,7 +120,7 @@ end
   m = Chain(
     CrossCor((2, 2), 1=>16, relu),
     MaxPool((2,2)),
-    CrossCor((2, 2), 16=>8, relu),
+    CrossCor((2, 2), 16=>8, relu; bias=false),
     MaxPool((2,2)),
     x -> reshape(x, :, size(x, 4)),
     Dense(288, 10), softmax)

test/utils.jl

Lines changed: 1 addition & 1 deletion
@@ -130,7 +130,7 @@ end
 end

 @testset "Zeros" begin
-  m = Dense(randn(2,3), Zeros())
+  m = Dense(3, 2; bias=false)
   @test f64(m).b === m.b === Zeros()
   @test f32(m).b === m.b === Zeros()
