@@ -19,7 +19,9 @@ module nf_optimizers
    real :: learning_rate = 0.01
  contains
    procedure(init), deferred :: init
-    procedure(minimize), deferred :: minimize
+    procedure(minimize_1d), deferred :: minimize_1d
+    procedure(minimize_2d), deferred :: minimize_2d
+    generic :: minimize => minimize_1d, minimize_2d
  end type optimizer_base_type

  abstract interface
@@ -30,13 +32,19 @@ impure elemental subroutine init(self, num_params)
      integer, intent(in) :: num_params
    end subroutine init

-    pure subroutine minimize(self, weights, biases, gradient)
+    pure subroutine minimize_1d(self, param, gradient)
      import :: optimizer_base_type
      class(optimizer_base_type), intent(inout) :: self
-      real, intent(inout), pointer :: weights(:)
-      real, intent(inout), pointer :: biases(:)
-      real, intent(in), pointer :: gradient(:)
-    end subroutine minimize
+      real, intent(inout) :: param(:)
+      real, intent(in) :: gradient(:)
+    end subroutine minimize_1d
+
+    pure subroutine minimize_2d(self, param, gradient)
+      import :: optimizer_base_type
+      class(optimizer_base_type), intent(inout) :: self
+      real, intent(inout) :: param(:,:)
+      real, intent(in) :: gradient(:,:)
+    end subroutine minimize_2d

  end interface

@@ -47,7 +55,8 @@ end subroutine minimize
    real, allocatable, private :: velocity(:)
  contains
    procedure :: init => init_sgd
-    procedure :: minimize => minimize_sgd
+    procedure :: minimize_1d => minimize_sgd_1d
+    procedure :: minimize_2d => minimize_sgd_2d
  end type sgd

  type, extends(optimizer_base_type) :: rmsprop
@@ -62,7 +71,8 @@ end subroutine minimize
    real, allocatable, private :: rms_gradient(:)
  contains
    procedure :: init => init_rmsprop
-    procedure :: minimize => minimize_rmsprop
+    procedure :: minimize_1d => minimize_rmsprop_1d
+    procedure :: minimize_2d => minimize_rmsprop_2d
  end type rmsprop

  type, extends(optimizer_base_type) :: adam
@@ -85,7 +95,8 @@ end subroutine minimize
    integer, private :: t = 0
  contains
    procedure :: init => init_adam
-    procedure :: minimize => minimize_adam
+    procedure :: minimize_1d => minimize_adam_1d
+    procedure :: minimize_2d => minimize_adam_2d
  end type adam

  type, extends(optimizer_base_type) :: adagrad
@@ -102,7 +113,8 @@ end subroutine minimize
    integer, private :: t = 0
  contains
    procedure :: init => init_adagrad
-    procedure :: minimize => minimize_adagrad
+    procedure :: minimize_1d => minimize_adagrad_1d
+    procedure :: minimize_2d => minimize_adagrad_2d
  end type adagrad

contains
@@ -117,35 +129,30 @@ impure elemental subroutine init_sgd(self, num_params)
  end subroutine init_sgd


-  pure subroutine minimize_sgd(self, weights, biases, gradient)
+  pure subroutine minimize_sgd_1d(self, param, gradient)
    !! Concrete implementation of a stochastic gradient descent optimizer
    !! update rule.
    class(sgd), intent(inout) :: self
-    real, intent(inout), pointer :: weights(:)
-    real, intent(inout), pointer :: biases(:)
-    real, intent(in), pointer :: gradient(:)
+    real, intent(inout) :: param(:)
+    real, intent(in) :: gradient(:)

    if (self % momentum > 0) then
      ! Apply momentum update
      self % velocity = self % momentum * self % velocity &
        - self % learning_rate * gradient
      if (self % nesterov) then
        ! Apply Nesterov update
-        weights = weights + self % momentum * self % velocity &
-          - self % learning_rate * gradient
-        biases = biases + self % momentum * self % velocity &
+        param = param + self % momentum * self % velocity &
          - self % learning_rate * gradient
      else
-        weights = weights + self % velocity
-        biases = biases + self % velocity
+        param = param + self % velocity
      end if
    else
      ! Apply regular update
-      weights = weights - self % learning_rate * gradient
-      biases = biases - self % learning_rate * gradient
+      param = param - self % learning_rate * gradient
    end if

-  end subroutine minimize_sgd
+  end subroutine minimize_sgd_1d


  impure elemental subroutine init_rmsprop(self, num_params)
@@ -158,24 +165,21 @@ impure elemental subroutine init_rmsprop(self, num_params)
  end subroutine init_rmsprop


-  pure subroutine minimize_rmsprop(self, weights, biases, gradient)
+  pure subroutine minimize_rmsprop_1d(self, param, gradient)
    !! Concrete implementation of a RMSProp optimizer update rule.
    class(rmsprop), intent(inout) :: self
-    real, intent(inout), pointer :: weights(:)
-    real, intent(inout), pointer :: biases(:)
-    real, intent(in), pointer :: gradient(:)
+    real, intent(inout) :: param(:)
+    real, intent(in) :: gradient(:)

    ! Compute the RMS of the gradient using the RMSProp rule
    self % rms_gradient = self % decay_rate * self % rms_gradient &
      + (1 - self % decay_rate) * gradient**2

    ! Update the network parameters based on the new RMS of the gradient
-    weights = weights - self % learning_rate &
-      / sqrt(self % rms_gradient + self % epsilon) * gradient
-    biases = biases - self % learning_rate &
+    param = param - self % learning_rate &
      / sqrt(self % rms_gradient + self % epsilon) * gradient

-  end subroutine minimize_rmsprop
+  end subroutine minimize_rmsprop_1d


  impure elemental subroutine init_adam(self, num_params)
@@ -189,18 +193,17 @@ impure elemental subroutine init_adam(self, num_params)
  end subroutine init_adam


-  pure subroutine minimize_adam(self, weights, biases, gradient)
+  pure subroutine minimize_adam_1d(self, param, gradient)
    !! Concrete implementation of an Adam optimizer update rule.
    class(adam), intent(inout) :: self
-    real, intent(inout), pointer :: weights(:)
-    real, intent(inout), pointer :: biases(:)
-    real, intent(in), pointer :: gradient(:)
+    real, intent(inout) :: param(:)
+    real, intent(in) :: gradient(:)

    self % t = self % t + 1

    ! If weight_decay_l2 > 0, use L2 regularization;
    ! otherwise, default to regular Adam.
-    associate(g => gradient + self % weight_decay_l2 * weights)
+    associate(g => gradient + self % weight_decay_l2 * param)
      self % m = self % beta1 * self % m + (1 - self % beta1) * g
      self % v = self % beta2 * self % v + (1 - self % beta2) * g**2
    end associate
@@ -212,19 +215,13 @@ pure subroutine minimize_adam(self, weights, biases, gradient)
    )

      ! Update parameters.
-      weights = weights &
+      param = param &
        - self % learning_rate * (m_hat / (sqrt(v_hat) + self % epsilon) &
-        + self % weight_decay_decoupled * weights)
-
-      ! Update biases (without weight decay for biases)
-      associate(g => gradient)
-        biases = biases &
-          - self % learning_rate * (m_hat / (sqrt(v_hat) + self % epsilon))
-      end associate
+        + self % weight_decay_decoupled * param)

    end associate

-  end subroutine minimize_adam
+  end subroutine minimize_adam_1d


  impure elemental subroutine init_adagrad(self, num_params)
@@ -237,43 +234,133 @@ impure elemental subroutine init_adagrad(self, num_params)
  end subroutine init_adagrad


-  pure subroutine minimize_adagrad(self, weights, biases, gradient)
+  pure subroutine minimize_adagrad_1d(self, param, gradient)
    !! Concrete implementation of an Adagrad optimizer update rule.
    class(adagrad), intent(inout) :: self
-    real, intent(inout), pointer :: weights(:)
-    real, intent(inout), pointer :: biases(:)
-    real, intent(in), pointer :: gradient(:)
+    real, intent(inout) :: param(:)
+    real, intent(in) :: gradient(:)

    ! Update the current time step
    self % t = self % t + 1

-    ! For weights
    associate( &
      ! If weight_decay_l2 > 0, use L2 regularization;
      ! otherwise, default to regular Adagrad.
-      g => gradient + self % weight_decay_l2 * weights, &
+      g => gradient + self % weight_decay_l2 * param, &
      ! Amortize the learning rate as function of the current time step.
      learning_rate => self % learning_rate &
        / (1 + (self % t - 1) * self % learning_rate_decay) &
    )

      self % sum_squared_gradient = self % sum_squared_gradient + g**2

-      weights = weights - learning_rate * g / (sqrt(self % sum_squared_gradient) &
+      param = param - learning_rate * g / (sqrt(self % sum_squared_gradient) &
        + self % epsilon)

    end associate
-
-    ! For biases (without weight decay)
+
+  end subroutine minimize_adagrad_1d
+
+
+  pure subroutine minimize_sgd_2d(self, param, gradient)
+    !! Concrete implementation of a stochastic gradient descent optimizer
+    !! update rule for 2D arrays.
+    class(sgd), intent(inout) :: self
+    real, intent(inout) :: param(:,:)
+    real, intent(in) :: gradient(:,:)
+
+    if (self % momentum > 0) then
+      ! Apply momentum update
+      self % velocity = self % momentum * self % velocity &
+        - self % learning_rate * reshape(gradient, [size(gradient)])
+      if (self % nesterov) then
+        ! Apply Nesterov update
+        param = param + reshape(self % momentum * self % velocity &
+          - self % learning_rate * reshape(gradient, [size(gradient)]), shape(param))
+      else
+        param = param + reshape(self % velocity, shape(param))
+      end if
+    else
+      ! Apply regular update
+      param = param - self % learning_rate * gradient
+    end if
+
+  end subroutine minimize_sgd_2d
+
+
+  pure subroutine minimize_rmsprop_2d(self, param, gradient)
+    !! Concrete implementation of a RMSProp optimizer update rule for 2D arrays.
+    class(rmsprop), intent(inout) :: self
+    real, intent(inout) :: param(:,:)
+    real, intent(in) :: gradient(:,:)
+
+    ! Compute the RMS of the gradient using the RMSProp rule
+    self % rms_gradient = self % decay_rate * self % rms_gradient &
+      + (1 - self % decay_rate) * reshape(gradient, [size(gradient)])**2
+
+    ! Update the network parameters based on the new RMS of the gradient
+    param = param - self % learning_rate &
+      / sqrt(reshape(self % rms_gradient, shape(param)) + self % epsilon) * gradient
+
+  end subroutine minimize_rmsprop_2d
+
+
+  pure subroutine minimize_adam_2d(self, param, gradient)
+    !! Concrete implementation of an Adam optimizer update rule for 2D arrays.
+    class(adam), intent(inout) :: self
+    real, intent(inout) :: param(:,:)
+    real, intent(in) :: gradient(:,:)
+
+    self % t = self % t + 1
+
+    ! If weight_decay_l2 > 0, use L2 regularization;
+    ! otherwise, default to regular Adam.
+    associate(g => reshape(gradient, [size(gradient)]) + self % weight_decay_l2 * reshape(param, [size(param)]))
+      self % m = self % beta1 * self % m + (1 - self % beta1) * g
+      self % v = self % beta2 * self % v + (1 - self % beta2) * g**2
+    end associate
+
+    ! Compute bias-corrected first and second moment estimates.
+    associate( &
+      m_hat => self % m / (1 - self % beta1**self % t), &
+      v_hat => self % v / (1 - self % beta2**self % t) &
+    )
+
+      ! Update parameters.
+      param = param &
+        - self % learning_rate * reshape(m_hat / (sqrt(v_hat) + self % epsilon), shape(param)) &
+        - self % learning_rate * self % weight_decay_decoupled * param
+
+    end associate
+
+  end subroutine minimize_adam_2d
+
+
+  pure subroutine minimize_adagrad_2d(self, param, gradient)
+    !! Concrete implementation of an Adagrad optimizer update rule for 2D arrays.
+    class(adagrad), intent(inout) :: self
+    real, intent(inout) :: param(:,:)
+    real, intent(in) :: gradient(:,:)
+
+    ! Update the current time step
+    self % t = self % t + 1
+
    associate( &
-      g => gradient, &
+      ! If weight_decay_l2 > 0, use L2 regularization;
+      ! otherwise, default to regular Adagrad.
+      g => reshape(gradient, [size(gradient)]) + self % weight_decay_l2 * reshape(param, [size(param)]), &
+      ! Amortize the learning rate as function of the current time step.
      learning_rate => self % learning_rate &
        / (1 + (self % t - 1) * self % learning_rate_decay) &
    )
-      biases = biases - learning_rate * g / (sqrt(self % sum_squared_gradient) &
-        + self % epsilon)
+
+      self % sum_squared_gradient = self % sum_squared_gradient + g**2
+
+      param = param - learning_rate * reshape(g / (sqrt(self % sum_squared_gradient) &
+        + self % epsilon), shape(param))
+
    end associate

-  end subroutine minimize_adagrad
+  end subroutine minimize_adagrad_2d

end module nf_optimizers
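Not part of the diff above: a minimal usage sketch of how the new generic minimize binding could be called once this change is in, assuming nf_optimizers and sgd are public as in the upstream library; the program name and the weights, biases, dw, and db variables are illustrative only, not taken from this PR.

program demo_generic_minimize
  use nf_optimizers, only: sgd
  implicit none
  type(sgd) :: opt
  real :: weights(4,3), biases(3), dw(4,3), db(3)

  ! Illustrative values; in a real network the gradients come from backprop.
  weights = 0.1
  biases = 0.
  dw = 0.01
  db = 0.02

  ! Default sgd (momentum = 0), so only the plain gradient step is exercised.
  opt = sgd(learning_rate=0.05)
  call opt % init(size(weights) + size(biases))

  ! The generic binding resolves on rank: rank-2 arguments dispatch to
  ! minimize_2d, rank-1 arguments to minimize_1d.
  call opt % minimize(weights, dw)
  call opt % minimize(biases, db)
end program demo_generic_minimize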