
Commit 6adc1c2

Added Adam optimizer implementation (#150)
* Added Adam optimizer implementation
* Call rmsprop from the optimizers module
* Minor fixes to improve Adam performance
* Suggested changes
* Remove dead code; format to <= 80 columns
* Use associate instead of explicit allocation for m_hat and v_hat; formatting
* Added convergence test for Adam
* AdamW: Adam with a weight decay modification
* AdamW modifications
* Fixed failing test
* Add notes; clean up; make more internal parameters private
* AdamW changes
* Flexible weight decay regularization
* Formatting

---------

Co-authored-by: milancurcic <[email protected]>
1 parent e9bfbd6 commit 6adc1c2

6 files changed (+230 -74 lines)


README.md (+2 -2)
@@ -18,7 +18,7 @@ Read the paper [here](https://arxiv.org/abs/1902.06714).
 * Training and inference of dense (fully connected) and convolutional neural
   networks
 * Stochastic gradient descent optimizers: Classic, momentum, Nesterov momentum,
-  and RMSProp
+  RMSProp, Adam, AdamW
 * More than a dozen activation functions and their derivatives
 * Loading dense and convolutional models from Keras HDF5 (.h5) files
 * Data-based parallelism
@@ -287,4 +287,4 @@ group.

 Neural-fortran has been used successfully in over a dozen published studies.
 See all papers that cite it
-[here](https://scholar.google.com/scholar?cites=7315840714744905948).
+[here](https://scholar.google.com/scholar?cites=7315840714744905948).

example/quadratic.f90 (+129 -64)
@@ -4,10 +4,10 @@ program quadratic_fit
   ! descent.
   use nf, only: dense, input, network
   use nf_dense_layer, only: dense_layer
-  use nf_optimizers, only: sgd
+  use nf_optimizers, only: sgd, rmsprop, adam

   implicit none
-  type(network) :: net(6)
+  type(network) :: net(9)

   ! Training parameters
   integer, parameter :: num_epochs = 1000
@@ -16,6 +16,9 @@ program quadratic_fit
   integer, parameter :: batch_size = 100
   real, parameter :: learning_rate = 0.01
   real, parameter :: decay_rate = 0.9
+  real, parameter :: beta1 = 0.85
+  real, parameter :: beta2 = 0.95
+  real, parameter :: epsilon = 1e-8

   ! Input and output data
   real, allocatable :: x(:), y(:) ! training data
@@ -51,19 +54,46 @@ program quadratic_fit
   call sgd_optimizer(net(1), x, y, xtest, ytest, learning_rate, num_epochs)

   ! SGD, momentum
-  call sgd_optimizer(net(2), x, y, xtest, ytest, learning_rate, num_epochs, momentum=0.9)
+  call sgd_optimizer( &
+    net(2), x, y, xtest, ytest, learning_rate, num_epochs, momentum=0.9 &
+  )

   ! SGD, momentum with Nesterov
-  call sgd_optimizer(net(3), x, y, xtest, ytest, learning_rate, num_epochs, momentum=0.9, nesterov=.true.)
+  call sgd_optimizer( &
+    net(3), x, y, xtest, ytest, learning_rate, num_epochs, &
+    momentum=0.9, nesterov=.true. &
+  )

   ! Batch SGD optimizer
   call batch_gd_optimizer(net(4), x, y, xtest, ytest, learning_rate, num_epochs)

   ! Mini-batch SGD optimizer
-  call minibatch_gd_optimizer(net(5), x, y, xtest, ytest, learning_rate, num_epochs, batch_size)
+  call minibatch_gd_optimizer( &
+    net(5), x, y, xtest, ytest, learning_rate, num_epochs, batch_size &
+  )

   ! RMSProp optimizer
-  call rmsprop_optimizer(net(6), x, y, xtest, ytest, learning_rate, num_epochs, decay_rate)
+  call rmsprop_optimizer( &
+    net(6), x, y, xtest, ytest, learning_rate, num_epochs, decay_rate &
+  )
+
+  ! Adam optimizer
+  call adam_optimizer( &
+    net(7), x, y, xtest, ytest, learning_rate, num_epochs, &
+    beta1, beta2, epsilon &
+  )
+
+  ! Adam optimizer with L2 regularization
+  call adam_optimizer( &
+    net(8), x, y, xtest, ytest, learning_rate, num_epochs, &
+    beta1, beta2, epsilon, weight_decay_l2=1e-4 &
+  )
+
+  ! Adam optimizer with decoupled weight decay regularization
+  call adam_optimizer( &
+    net(9), x, y, xtest, ytest, learning_rate, num_epochs, &
+    beta1, beta2, epsilon, weight_decay_decoupled=1e-5 &
+  )

 contains
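
For reference, the update rule behind the adam constructor used above is the standard Adam rule of Kingma and Ba. The sketch below is written from the textbook definition, not from the library source (which is outside this diff); beta1, beta2, and epsilon are the example's hyperparameters, eta is learning_rate, and the m_hat/v_hat mentioned in the commit message are the bias-corrected moments:

\begin{aligned}
m_t &= \beta_1 m_{t-1} + (1 - \beta_1)\, g_t \\
v_t &= \beta_2 v_{t-1} + (1 - \beta_2)\, g_t^2 \\
\hat{m}_t &= \frac{m_t}{1 - \beta_1^t}, \qquad \hat{v}_t = \frac{v_t}{1 - \beta_2^t} \\
w_t &= w_{t-1} - \eta\, \frac{\hat{m}_t}{\sqrt{\hat{v}_t} + \epsilon}
\end{aligned}

where g_t is the gradient of the loss with respect to the weight w at step t.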

@@ -73,7 +103,9 @@ real elemental function quadratic(x) result(y)
     y = (x**2 / 2 + x / 2 + 1) / 2
   end function quadratic

-  subroutine sgd_optimizer(net, x, y, xtest, ytest, learning_rate, num_epochs, momentum, nesterov)
+  subroutine sgd_optimizer( &
+    net, x, y, xtest, ytest, learning_rate, num_epochs, momentum, nesterov &
+  )
     ! In the stochastic gradient descent (SGD) optimizer, we run the forward
     ! and backward passes and update the weights for each training sample,
     ! one at a time.
@@ -109,12 +141,19 @@ subroutine sgd_optimizer(net, x, y, xtest, ytest, learning_rate, num_epochs, mom
       do i = 1, size(x)
         call net % forward([x(i)])
         call net % backward([y(i)])
-        call net % update(sgd(learning_rate=learning_rate, momentum=momentum_value, nesterov=nesterov_value))
+        call net % update( &
+          sgd( &
+            learning_rate=learning_rate, &
+            momentum=momentum_value, &
+            nesterov=nesterov_value &
+          ) &
+        )
       end do

       if (mod(n, num_epochs / 10) == 0) then
         ypred = [(net % predict([xtest(i)]), i = 1, size(xtest))]
-        print '("Epoch: ", i4,"/",i4,", RMSE = ", f9.6)', n, num_epochs, sum((ypred - ytest)**2) / size(ytest)
+        print '("Epoch: ", i4,"/",i4,", RMSE = ", f9.6)', &
+          n, num_epochs, sum((ypred - ytest)**2) / size(ytest)
       end if

     end do
@@ -123,7 +162,9 @@ subroutine sgd_optimizer(net, x, y, xtest, ytest, learning_rate, num_epochs, mom

   end subroutine sgd_optimizer

-  subroutine batch_gd_optimizer(net, x, y, xtest, ytest, learning_rate, num_epochs)
+  subroutine batch_gd_optimizer( &
+    net, x, y, xtest, ytest, learning_rate, num_epochs &
+  )
     ! Like the stochastic gradient descent (SGD) optimizer, except that here we
     ! accumulate the weight gradients for all training samples and update the
     ! weights once per epoch.
@@ -147,7 +188,8 @@ subroutine batch_gd_optimizer(net, x, y, xtest, ytest, learning_rate, num_epochs

       if (mod(n, num_epochs / 10) == 0) then
         ypred = [(net % predict([xtest(i)]), i = 1, size(xtest))]
-        print '("Epoch: ", i4,"/",i4,", RMSE = ", f9.6)', n, num_epochs, sum((ypred - ytest)**2) / size(ytest)
+        print '("Epoch: ", i4,"/",i4,", RMSE = ", f9.6)', &
+          n, num_epochs, sum((ypred - ytest)**2) / size(ytest)
       end if

     end do
@@ -156,7 +198,9 @@ subroutine batch_gd_optimizer(net, x, y, xtest, ytest, learning_rate, num_epochs

   end subroutine batch_gd_optimizer

-  subroutine minibatch_gd_optimizer(net, x, y, xtest, ytest, learning_rate, num_epochs, batch_size)
+  subroutine minibatch_gd_optimizer( &
+    net, x, y, xtest, ytest, learning_rate, num_epochs, batch_size &
+  )
     ! Like the batch SGD optimizer, except that here we accumulate the weight
     ! over a number of mini batches and update the weights once per mini batch.
     !
@@ -203,7 +247,8 @@ subroutine minibatch_gd_optimizer(net, x, y, xtest, ytest, learning_rate, num_ep

       if (mod(n, num_epochs / 10) == 0) then
         ypred = [(net % predict([xtest(i)]), i = 1, size(xtest))]
-        print '("Epoch: ", i4,"/",i4,", RMSE = ", f9.6)', n, num_epochs, sum((ypred - ytest)**2) / size(ytest)
+        print '("Epoch: ", i4,"/",i4,", RMSE = ", f9.6)', &
+          n, num_epochs, sum((ypred - ytest)**2) / size(ytest)
       end if

     end do
@@ -212,79 +257,36 @@ subroutine minibatch_gd_optimizer(net, x, y, xtest, ytest, learning_rate, num_ep

   end subroutine minibatch_gd_optimizer

-  subroutine rmsprop_optimizer(net, x, y, xtest, ytest, learning_rate, num_epochs, decay_rate)
+  subroutine rmsprop_optimizer( &
+    net, x, y, xtest, ytest, learning_rate, num_epochs, decay_rate &
+  )
     ! RMSprop optimizer for updating weights using root mean square
     type(network), intent(inout) :: net
     real, intent(in) :: x(:), y(:)
     real, intent(in) :: xtest(:), ytest(:)
     real, intent(in) :: learning_rate, decay_rate
     integer, intent(in) :: num_epochs
     integer :: i, j, n
-    real, parameter :: epsilon = 1e-8 ! Small constant to avoid division by zero
     real, allocatable :: ypred(:)

-    ! Define a dedicated type to store the RMSprop gradients.
-    ! This is needed because array sizes vary between layers and we need to
-    ! keep track of gradients for each layer over time.
-    ! For now this works only for dense layers.
-    ! We will need to define a similar type for conv2d layers.
-    type :: rms_gradient_dense
-      real, allocatable :: dw(:,:)
-      real, allocatable :: db(:)
-    end type rms_gradient_dense
-
-    type(rms_gradient_dense), allocatable :: rms(:)
-
     print '(a)', 'RMSProp optimizer'
     print '(34("-"))'

-    ! Here we allocate the array or RMS gradient derived types.
-    ! We need one for each dense layer, however we will allocate it to the
-    ! length of all layers as it will make housekeeping easier.
-    allocate(rms(size(net % layers)))
-
     do n = 1, num_epochs

       do i = 1, size(x)
         call net % forward([x(i)])
         call net % backward([y(i)])
       end do

-      ! RMSprop update rule
-      do j = 1, size(net % layers)
-        select type (this_layer => net % layers(j) % p)
-          type is (dense_layer)
-
-            ! If this is our first time here for this layer, allocate the
-            ! internal RMS gradient arrays and initialize them to zero.
-            if (.not. allocated(rms(j) % dw)) then
-              allocate(rms(j) % dw, mold=this_layer % dw)
-              allocate(rms(j) % db, mold=this_layer % db)
-              rms(j) % dw = 0
-              rms(j) % db = 0
-            end if
-
-            ! Update the RMS gradients using the RMSprop moving average rule
-            rms(j) % dw = decay_rate * rms(j) % dw + (1 - decay_rate) * this_layer % dw**2
-            rms(j) % db = decay_rate * rms(j) % db + (1 - decay_rate) * this_layer % db**2
-
-            ! Update weights and biases using the RMSprop update rule
-            this_layer % weights = this_layer % weights - learning_rate &
-              / sqrt(rms(j) % dw + epsilon) * this_layer % dw
-            this_layer % biases = this_layer % biases - learning_rate &
-              / sqrt(rms(j) % db + epsilon) * this_layer % db
-
-            ! We have updated the weights and biases, so we need to reset the
-            ! gradients to zero for the next epoch.
-            this_layer % dw = 0
-            this_layer % db = 0
-
-        end select
-      end do
+      call net % update( &
+        rmsprop(learning_rate=learning_rate, decay_rate=decay_rate) &
+      )

       if (mod(n, num_epochs / 10) == 0) then
         ypred = [(net % predict([xtest(i)]), i = 1, size(xtest))]
-        print '("Epoch: ", i4,"/",i4,", RMSE = ", f9.6)', n, num_epochs, sum((ypred - ytest)**2) / size(ytest)
+        print '("Epoch: ", i4,"/",i4,", RMSE = ", f9.6)', &
+          n, num_epochs, sum((ypred - ytest)**2) / size(ytest)
       end if

     end do
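
The block deleted above is what the library-side rmsprop optimizer now encapsulates behind net % update. Written out, the per-layer rule it implemented was

\begin{aligned}
v &\leftarrow \rho\, v + (1 - \rho)\, g^2 \\
w &\leftarrow w - \frac{\eta}{\sqrt{v + \epsilon}}\, g
\end{aligned}

with rho = decay_rate, eta = learning_rate, g the accumulated gradient (dw or db), and epsilon the small constant that used to be the subroutine-local parameter. Whether the library version matches this line for line is not visible in this diff, but the deleted code is a faithful reference for the intended behavior.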
@@ -293,6 +295,69 @@ subroutine rmsprop_optimizer(net, x, y, xtest, ytest, learning_rate, num_epochs,

   end subroutine rmsprop_optimizer

+  subroutine adam_optimizer( &
+    net, x, y, xtest, ytest, learning_rate, num_epochs, beta1, beta2, epsilon, &
+    weight_decay_l2, weight_decay_decoupled &
+  )
+    ! Adam optimizer
+    type(network), intent(inout) :: net
+    real, intent(in) :: x(:), y(:)
+    real, intent(in) :: xtest(:), ytest(:)
+    real, intent(in) :: learning_rate, beta1, beta2, epsilon
+    real, intent(in), optional :: weight_decay_l2
+    real, intent(in), optional :: weight_decay_decoupled
+    integer, intent(in) :: num_epochs
+    real, allocatable :: ypred(:)
+    integer :: i, n
+    real :: weight_decay_l2_val
+    real :: weight_decay_decoupled_val
+
+    ! Set default values for weight_decay_l2
+    if (.not. present(weight_decay_l2)) then
+      weight_decay_l2_val = 0.0
+    else
+      weight_decay_l2_val = weight_decay_l2
+    end if
+
+    ! Set default values for weight_decay_decoupled
+    if (.not. present(weight_decay_decoupled)) then
+      weight_decay_decoupled_val = 0.0
+    else
+      weight_decay_decoupled_val = weight_decay_decoupled
+    end if
+
+    print '(a)', 'Adam optimizer'
+    print '(34("-"))'
+
+    do n = 1, num_epochs
+      do i = 1, size(x)
+        call net % forward([x(i)])
+        call net % backward([y(i)])
+      end do
+
+      call net % update( &
+        adam( &
+          learning_rate=learning_rate, &
+          beta1=beta1, &
+          beta2=beta2, &
+          epsilon=epsilon, &
+          weight_decay_l2=weight_decay_l2_val, &
+          weight_decay_decoupled=weight_decay_decoupled_val &
+        ) &
+      )
+
+      if (mod(n, num_epochs / 10) == 0) then
+        ypred = [(net % predict([xtest(i)]), i = 1, size(xtest))]
+        print '("Epoch: ", i4,"/",i4,", RMSE = ", f9.6)', &
+          n, num_epochs, sum((ypred - ytest)**2) / size(ytest)
+      end if
+
+    end do
+
+    print *, ''
+
+  end subroutine adam_optimizer
+
   subroutine shuffle(arr)
     ! Shuffle an array using the Fisher-Yates algorithm.
     integer, intent(inout) :: arr(:)
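
The two optional arguments of adam_optimizer select different weight decay conventions. The optimizer source is not part of this diff, so the following is the usual reading of the argument names rather than a statement about the implementation: weight_decay_l2 corresponds to classic L2 regularization, which folds the penalty into the gradient before the moment estimates, while weight_decay_decoupled corresponds to AdamW (Loshchilov and Hutter), which keeps the decay out of the adaptive scaling and applies it directly in the weight update:

\begin{aligned}
\text{L2:} \qquad & g_t \leftarrow g_t + \lambda\, w_{t-1} \\
\text{decoupled (AdamW):} \qquad & w_t = w_{t-1} - \eta \left( \frac{\hat{m}_t}{\sqrt{\hat{v}_t} + \epsilon} + \lambda_d\, w_{t-1} \right)
\end{aligned}

This is why the example passes weight_decay_l2=1e-4 and weight_decay_decoupled=1e-5 to two separate network instances: the two penalties are not interchangeable.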

fpm.toml (+1 -1)
@@ -1,5 +1,5 @@
 name = "neural-fortran"
-version = "0.13.0"
+version = "0.14.0"
 license = "MIT"
 author = "Milan Curcic"
 maintainer = "[email protected]"

src/nf.f90 (+1 -1)
@@ -5,7 +5,7 @@ module nf
   use nf_layer_constructors, only: &
     conv2d, dense, flatten, input, maxpool2d, reshape
   use nf_network, only: network
-  use nf_optimizers, only: sgd, rmsprop
+  use nf_optimizers, only: sgd, rmsprop, adam
   use nf_activation, only: activation_function, elu, exponential, &
     gaussian, linear, relu, leaky_relu, &
     sigmoid, softmax, softplus, step, tanhf, &
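
With adam re-exported from the top-level nf module, user code can construct the optimizer inline and pass it to net % update, as example/quadratic.f90 does above. A minimal, self-contained sketch follows; the architecture, training data, and hyperparameter values are illustrative placeholders, and the network([...]) constructor follows the pattern used elsewhere in neural-fortran rather than anything shown in this diff:

program adam_usage_sketch
  ! Minimal sketch: drive a tiny network with the Adam optimizer that
  ! src/nf.f90 now re-exports. Architecture, data, and hyperparameters
  ! are illustrative placeholders, not values taken from the commit.
  use nf, only: dense, input, network, adam
  implicit none
  type(network) :: net
  integer :: n

  ! One scalar input, a small hidden layer, one scalar output (hypothetical).
  net = network([input(1), dense(3), dense(1)])

  do n = 1, 100
    ! Fit a single point; 0.6875 is quadratic(0.5) from the example program.
    call net % forward([0.5])
    call net % backward([0.6875])
    ! Same keyword arguments as the example's adam_optimizer subroutine;
    ! zero weight decay reproduces plain Adam.
    call net % update( &
      adam( &
        learning_rate=0.01, &
        beta1=0.85, &
        beta2=0.95, &
        epsilon=1e-8, &
        weight_decay_l2=0.0, &
        weight_decay_decoupled=0.0 &
      ) &
    )
  end do

  print *, 'Prediction for x = 0.5:', net % predict([0.5])

end program adam_usage_sketch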
