@@ -4,10 +4,10 @@ program quadratic_fit
  ! descent.
  use nf, only: dense, input, network
  use nf_dense_layer, only: dense_layer
-  use nf_optimizers, only: sgd
+  use nf_optimizers, only: sgd, rmsprop, adam

  implicit none
-  type(network) :: net(6)
+  type(network) :: net(9)

  ! Training parameters
  integer, parameter :: num_epochs = 1000
@@ -16,6 +16,9 @@ program quadratic_fit
  integer, parameter :: batch_size = 100
  real, parameter :: learning_rate = 0.01
  real, parameter :: decay_rate = 0.9
+  real, parameter :: beta1 = 0.85
+  real, parameter :: beta2 = 0.95
+  real, parameter :: epsilon = 1e-8
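+  ! beta1 and beta2 are the exponential decay rates for Adam's first- and
+  ! second-moment estimates; epsilon is a small constant that guards against
+  ! division by zero in the update.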

  ! Input and output data
  real, allocatable :: x(:), y(:) ! training data
@@ -51,19 +54,46 @@ program quadratic_fit
  call sgd_optimizer(net(1), x, y, xtest, ytest, learning_rate, num_epochs)

  ! SGD, momentum
-  call sgd_optimizer(net(2), x, y, xtest, ytest, learning_rate, num_epochs, momentum=0.9)
+  call sgd_optimizer( &
+    net(2), x, y, xtest, ytest, learning_rate, num_epochs, momentum=0.9 &
+  )

  ! SGD, momentum with Nesterov
-  call sgd_optimizer(net(3), x, y, xtest, ytest, learning_rate, num_epochs, momentum=0.9, nesterov=.true.)
+  call sgd_optimizer( &
+    net(3), x, y, xtest, ytest, learning_rate, num_epochs, &
+    momentum=0.9, nesterov=.true. &
+  )

  ! Batch SGD optimizer
  call batch_gd_optimizer(net(4), x, y, xtest, ytest, learning_rate, num_epochs)

  ! Mini-batch SGD optimizer
-  call minibatch_gd_optimizer(net(5), x, y, xtest, ytest, learning_rate, num_epochs, batch_size)
+  call minibatch_gd_optimizer( &
+    net(5), x, y, xtest, ytest, learning_rate, num_epochs, batch_size &
+  )

  ! RMSProp optimizer
-  call rmsprop_optimizer(net(6), x, y, xtest, ytest, learning_rate, num_epochs, decay_rate)
+  call rmsprop_optimizer( &
+    net(6), x, y, xtest, ytest, learning_rate, num_epochs, decay_rate &
+  )
+
+  ! Adam optimizer
+  call adam_optimizer( &
+    net(7), x, y, xtest, ytest, learning_rate, num_epochs, &
+    beta1, beta2, epsilon &
+  )
+
+  ! Adam optimizer with L2 regularization
+  call adam_optimizer( &
+    net(8), x, y, xtest, ytest, learning_rate, num_epochs, &
+    beta1, beta2, epsilon, weight_decay_l2=1e-4 &
+  )
+
+  ! Adam optimizer with decoupled weight decay regularization
+  call adam_optimizer( &
+    net(9), x, y, xtest, ytest, learning_rate, num_epochs, &
+    beta1, beta2, epsilon, weight_decay_decoupled=1e-5 &
+  )
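+  ! Note: as the names suggest, weight_decay_l2 adds a classic L2 penalty
+  ! through the gradients, while weight_decay_decoupled applies the decay
+  ! directly to the weights (AdamW-style decoupled weight decay).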

contains

@@ -73,7 +103,9 @@ real elemental function quadratic(x) result(y)
    y = (x**2 / 2 + x / 2 + 1) / 2
  end function quadratic

-  subroutine sgd_optimizer(net, x, y, xtest, ytest, learning_rate, num_epochs, momentum, nesterov)
+  subroutine sgd_optimizer( &
+    net, x, y, xtest, ytest, learning_rate, num_epochs, momentum, nesterov &
+  )
    ! In the stochastic gradient descent (SGD) optimizer, we run the forward
    ! and backward passes and update the weights for each training sample,
    ! one at a time.
@@ -109,12 +141,19 @@ subroutine sgd_optimizer(net, x, y, xtest, ytest, learning_rate, num_epochs, mom
      do i = 1, size(x)
        call net % forward([x(i)])
        call net % backward([y(i)])
-        call net % update(sgd(learning_rate=learning_rate, momentum=momentum_value, nesterov=nesterov_value))
+        call net % update( &
+          sgd( &
+            learning_rate=learning_rate, &
+            momentum=momentum_value, &
+            nesterov=nesterov_value &
+          ) &
+        )
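+        ! In the usual SGD-with-momentum formulation, the optimizer keeps a
+        ! velocity v = momentum * v - learning_rate * gradient and applies
+        ! w = w + v; with nesterov=.true. the gradient is evaluated at the
+        ! look-ahead point w + momentum * v.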
      end do

      if (mod(n, num_epochs / 10) == 0) then
        ypred = [(net % predict([xtest(i)]), i = 1, size(xtest))]
-        print '("Epoch: ", i4,"/",i4,", RMSE = ", f9.6)', n, num_epochs, sum((ypred - ytest)**2) / size(ytest)
+        print '("Epoch: ", i4,"/",i4,", RMSE = ", f9.6)', &
+          n, num_epochs, sum((ypred - ytest)**2) / size(ytest)
      end if

    end do
@@ -123,7 +162,9 @@ subroutine sgd_optimizer(net, x, y, xtest, ytest, learning_rate, num_epochs, mom

  end subroutine sgd_optimizer

-  subroutine batch_gd_optimizer(net, x, y, xtest, ytest, learning_rate, num_epochs)
+  subroutine batch_gd_optimizer( &
+    net, x, y, xtest, ytest, learning_rate, num_epochs &
+  )
    ! Like the stochastic gradient descent (SGD) optimizer, except that here we
    ! accumulate the weight gradients for all training samples and update the
    ! weights once per epoch.
@@ -147,7 +188,8 @@ subroutine batch_gd_optimizer(net, x, y, xtest, ytest, learning_rate, num_epochs

      if (mod(n, num_epochs / 10) == 0) then
        ypred = [(net % predict([xtest(i)]), i = 1, size(xtest))]
-        print '("Epoch: ", i4,"/",i4,", RMSE = ", f9.6)', n, num_epochs, sum((ypred - ytest)**2) / size(ytest)
+        print '("Epoch: ", i4,"/",i4,", RMSE = ", f9.6)', &
+          n, num_epochs, sum((ypred - ytest)**2) / size(ytest)
      end if

    end do
@@ -156,7 +198,9 @@ subroutine batch_gd_optimizer(net, x, y, xtest, ytest, learning_rate, num_epochs

  end subroutine batch_gd_optimizer

-  subroutine minibatch_gd_optimizer(net, x, y, xtest, ytest, learning_rate, num_epochs, batch_size)
+  subroutine minibatch_gd_optimizer( &
+    net, x, y, xtest, ytest, learning_rate, num_epochs, batch_size &
+  )
    ! Like the batch SGD optimizer, except that here we accumulate the weight
    ! gradients over a number of mini-batches and update the weights once per mini-batch.
    !
@@ -203,7 +247,8 @@ subroutine minibatch_gd_optimizer(net, x, y, xtest, ytest, learning_rate, num_ep

      if (mod(n, num_epochs / 10) == 0) then
        ypred = [(net % predict([xtest(i)]), i = 1, size(xtest))]
-        print '("Epoch: ", i4,"/",i4,", RMSE = ", f9.6)', n, num_epochs, sum((ypred - ytest)**2) / size(ytest)
+        print '("Epoch: ", i4,"/",i4,", RMSE = ", f9.6)', &
+          n, num_epochs, sum((ypred - ytest)**2) / size(ytest)
      end if

    end do
@@ -212,79 +257,36 @@ subroutine minibatch_gd_optimizer(net, x, y, xtest, ytest, learning_rate, num_ep

  end subroutine minibatch_gd_optimizer

-  subroutine rmsprop_optimizer(net, x, y, xtest, ytest, learning_rate, num_epochs, decay_rate)
+  subroutine rmsprop_optimizer( &
+    net, x, y, xtest, ytest, learning_rate, num_epochs, decay_rate &
+  )
    ! RMSprop optimizer for updating weights using a running root mean square of the gradients.
    type(network), intent(inout) :: net
    real, intent(in) :: x(:), y(:)
    real, intent(in) :: xtest(:), ytest(:)
    real, intent(in) :: learning_rate, decay_rate
    integer, intent(in) :: num_epochs
    integer :: i, j, n
-    real, parameter :: epsilon = 1e-8 ! Small constant to avoid division by zero
    real, allocatable :: ypred(:)

-    ! Define a dedicated type to store the RMSprop gradients.
-    ! This is needed because array sizes vary between layers and we need to
-    ! keep track of gradients for each layer over time.
-    ! For now this works only for dense layers.
-    ! We will need to define a similar type for conv2d layers.
-    type :: rms_gradient_dense
-      real, allocatable :: dw(:,:)
-      real, allocatable :: db(:)
-    end type rms_gradient_dense
-
-    type(rms_gradient_dense), allocatable :: rms(:)
-
    print '(a)', 'RMSProp optimizer'
    print '(34("-"))'

-    ! Here we allocate the array of RMS gradient derived types.
-    ! We need one for each dense layer, however we will allocate it to the
-    ! length of all layers as it will make housekeeping easier.
-    allocate(rms(size(net % layers)))
-
    do n = 1, num_epochs

      do i = 1, size(x)
        call net % forward([x(i)])
        call net % backward([y(i)])
      end do

-      ! RMSprop update rule
-      do j = 1, size(net % layers)
-        select type (this_layer => net % layers(j) % p)
-          type is (dense_layer)
-
-            ! If this is our first time here for this layer, allocate the
-            ! internal RMS gradient arrays and initialize them to zero.
-            if (.not. allocated(rms(j) % dw)) then
-              allocate(rms(j) % dw, mold=this_layer % dw)
-              allocate(rms(j) % db, mold=this_layer % db)
-              rms(j) % dw = 0
-              rms(j) % db = 0
-            end if
-
-            ! Update the RMS gradients using the RMSprop moving average rule
-            rms(j) % dw = decay_rate * rms(j) % dw + (1 - decay_rate) * this_layer % dw**2
-            rms(j) % db = decay_rate * rms(j) % db + (1 - decay_rate) * this_layer % db**2
-
-            ! Update weights and biases using the RMSprop update rule
-            this_layer % weights = this_layer % weights - learning_rate &
-              / sqrt(rms(j) % dw + epsilon) * this_layer % dw
-            this_layer % biases = this_layer % biases - learning_rate &
-              / sqrt(rms(j) % db + epsilon) * this_layer % db
-
-            ! We have updated the weights and biases, so we need to reset the
-            ! gradients to zero for the next epoch.
-            this_layer % dw = 0
-            this_layer % db = 0
-
-        end select
-      end do
+      call net % update( &
+        rmsprop(learning_rate=learning_rate, decay_rate=decay_rate) &
+      )
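+      ! The rmsprop optimizer type takes over what the hand-rolled loop
+      ! removed above did: it keeps a moving average of the squared gradients,
+      !   rms = decay_rate * rms + (1 - decay_rate) * gradient**2,
+      ! and divides each gradient step by sqrt(rms + epsilon).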

      if (mod(n, num_epochs / 10) == 0) then
        ypred = [(net % predict([xtest(i)]), i = 1, size(xtest))]
-        print '("Epoch: ", i4,"/",i4,", RMSE = ", f9.6)', n, num_epochs, sum((ypred - ytest)**2) / size(ytest)
+        print '("Epoch: ", i4,"/",i4,", RMSE = ", f9.6)', &
+          n, num_epochs, sum((ypred - ytest)**2) / size(ytest)
      end if

    end do
@@ -293,6 +295,69 @@ subroutine rmsprop_optimizer(net, x, y, xtest, ytest, learning_rate, num_epochs,

  end subroutine rmsprop_optimizer

+  subroutine adam_optimizer( &
+    net, x, y, xtest, ytest, learning_rate, num_epochs, beta1, beta2, epsilon, &
+    weight_decay_l2, weight_decay_decoupled &
+  )
+    ! Adam optimizer, with optional L2 and decoupled weight decay regularization.
+    type(network), intent(inout) :: net
+    real, intent(in) :: x(:), y(:)
+    real, intent(in) :: xtest(:), ytest(:)
+    real, intent(in) :: learning_rate, beta1, beta2, epsilon
+    real, intent(in), optional :: weight_decay_l2
+    real, intent(in), optional :: weight_decay_decoupled
+    integer, intent(in) :: num_epochs
+    real, allocatable :: ypred(:)
+    integer :: i, n
+    real :: weight_decay_l2_val
+    real :: weight_decay_decoupled_val
+
+    ! Set the default value for weight_decay_l2
+    if (.not. present(weight_decay_l2)) then
+      weight_decay_l2_val = 0.0
+    else
+      weight_decay_l2_val = weight_decay_l2
+    end if
+
+    ! Set the default value for weight_decay_decoupled
+    if (.not. present(weight_decay_decoupled)) then
+      weight_decay_decoupled_val = 0.0
+    else
+      weight_decay_decoupled_val = weight_decay_decoupled
+    end if
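+    ! A decay value of 0.0 disables the corresponding regularization term, so
+    ! with both left at their defaults this reduces to plain Adam.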
+
+    print '(a)', 'Adam optimizer'
+    print '(34("-"))'
+
+    do n = 1, num_epochs
+      do i = 1, size(x)
+        call net % forward([x(i)])
+        call net % backward([y(i)])
+      end do
+
+      call net % update( &
+        adam( &
+          learning_rate=learning_rate, &
+          beta1=beta1, &
+          beta2=beta2, &
+          epsilon=epsilon, &
+          weight_decay_l2=weight_decay_l2_val, &
+          weight_decay_decoupled=weight_decay_decoupled_val &
+        ) &
+      )
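+      ! In the standard Adam update, beta1 and beta2 control exponentially
+      ! decaying averages of the gradients (first moment) and of the squared
+      ! gradients (second moment), and the step is proportional to
+      !   m_hat / (sqrt(v_hat) + epsilon)
+      ! after bias correction of both moment estimates.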
+
+      if (mod(n, num_epochs / 10) == 0) then
+        ypred = [(net % predict([xtest(i)]), i = 1, size(xtest))]
+        print '("Epoch: ", i4,"/",i4,", RMSE = ", f9.6)', &
+          n, num_epochs, sum((ypred - ytest)**2) / size(ytest)
+      end if
+
+    end do
+
+    print *, ''
+
+  end subroutine adam_optimizer
+
  subroutine shuffle(arr)
    ! Shuffle an array using the Fisher-Yates algorithm.
    integer, intent(inout) :: arr(:)