FluxML · ludvigk · Nov 23, 2021 · Nov 23, 2021 · Nov 23, 2021 · May 20, 2022
diff --git a/src/optimise/optimisers.jl b/src/optimise/optimisers.jl
@@ -110,40 +110,55 @@ function apply!(o::Nesterov, x, Δ)
 end
 
 """
-    RMSProp(η = 0.001, ρ = 0.9, ϵ = $EPS)
+    RMSProp(η = 0.001, ρ = 0.9, ϵ = $EPS; centered = false)
 
 Optimizer using the
 [RMSProp](https://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf)
 algorithm. Often a good choice for recurrent networks. Parameters other than learning rate
 generally don't need tuning.
 
+The [centered version](https://arxiv.org/pdf/1308.0850v5.pdf) of RMSProp maintains a moving
+average of the gradient to center the second order moment used for normalization.
+
 # Parameters
 - Learning rate (`η`): Amount by which gradients are discounted before updating
                        the weights.
 - Momentum (`ρ`): Controls the acceleration of gradient descent in the
-                  prominent direction, in effect damping oscillations.
+                  prominent direction, in effect dampening oscillations.
+- Centered (`centered`): Whether to use the centered version of RMSProp.
 
 # Examples
 ```julia
 opt = RMSProp()
 
-opt = RMSProp(0.002, 0.95)
+opt = RMSProp(0.002, 0.95, true)
 ```
 """
 mutable struct RMSProp <: AbstractOptimiser
   eta::Float64
   rho::Float64
+  centered::Bool  # when true, `apply!` also tracks a running gradient mean to center the second moment
   epsilon::Float64
   acc::IdDict
 end
-RMSProp(η::Real = 0.001, ρ::Real = 0.9, ϵ::Real = EPS) = RMSProp(η, ρ, ϵ, IdDict())
-RMSProp(η::Real, ρ::Real, acc::IdDict) = RMSProp(η, ρ, EPS, acc)
+
+# The positional-`centered` form requires its third argument so it cannot overwrite the 0-2 argument methods of the keyword form.
+RMSProp(η::Real, ρ::Real, centered::Bool, ϵ::Real = EPS) = RMSProp(η, ρ, centered, ϵ, IdDict())
+RMSProp(η::Real = 0.001, ρ::Real = 0.9, ϵ::Real = EPS; centered::Bool = false) = RMSProp(η, ρ, centered, ϵ, IdDict())
+RMSProp(η::Real, ρ::Real, acc::IdDict; centered::Bool = false) = RMSProp(η, ρ, centered, EPS, acc)  # field order: eta, rho, centered, epsilon, acc
 
 function apply!(o::RMSProp, x, Δ)
   η, ρ = o.eta, o.rho
-  acc = get!(() -> zero(x), o.acc, x)::typeof(x)
+  # Per-parameter state keyed by `x`: (running mean of Δ * conj(Δ), running mean of Δ), both created as zero(x).
+  acc, Δ_ave = get!(o.acc, x) do
+    (zero(x), zero(x))
+  end :: Tuple{typeof(x),typeof(x)}
+  # `Δ_ave` is only updated when `o.centered` is set; otherwise it keeps its initial zero and the subtraction below has no effect.
   @. acc = ρ * acc + (1 - ρ) * Δ * conj(Δ)
-  @. Δ *= η / (√acc + o.epsilon)
+  if o.centered
+    @. Δ_ave = ρ * Δ_ave + (1 - ρ) * Δ
+  end
+  @. Δ *= η / (√(acc - Δ_ave * conj(Δ_ave)) + o.epsilon)
 end
 
 """
@@ -175,7 +190,6 @@ ADAM(η::Real, β::Tuple, state::IdDict) = ADAM(η, β, EPS, state)
 
 function apply!(o::ADAM, x, Δ)
   η, β = o.eta, o.beta
-
   mt, vt, βp = get!(o.state, x) do
       (zero(x), zero(x), Float64[β[1], β[2]])
   end :: Tuple{typeof(x),typeof(x),Vector{Float64}}