@@ -30,11 +30,12 @@ impure elemental subroutine init(self, num_params)
      integer, intent(in) :: num_params
    end subroutine init

-   pure subroutine minimize(self, param, gradient)
+   pure subroutine minimize(self, weights, biases, gradient)
      import :: optimizer_base_type
      class(optimizer_base_type), intent(inout) :: self
-     real, intent(inout) :: param(:)
-     real, intent(in) :: gradient(:)
+     real, intent(inout), pointer :: weights(:)
+     real, intent(inout), pointer :: biases(:)
+     real, intent(in), pointer :: gradient(:)
    end subroutine minimize

  end interface
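
The hunk above changes the abstract minimize interface from a single param(:) array to separate weights(:) and biases(:) arguments, all with the pointer attribute, so callers must now pass pointer actual arguments, for example views into a layer's flat parameter storage. The standalone sketch below illustrates that calling convention under those assumptions; it is not neural-fortran code, and the names storage, w, b, and g are invented for the example.

    program pointer_args_demo
      ! Standalone illustration of the pointer-based argument convention
      ! implied by the updated abstract interface above; not part of the library.
      implicit none
      real, target :: storage(6)
      real, pointer :: w(:), b(:), g(:)

      storage = 1.0
      w => storage(1:3)   ! weights as a view into flat parameter storage
      b => storage(4:6)   ! biases as a view into the same storage
      allocate(g(3))
      g = 0.1

      call minimize(w, b, g)
      print *, w, b

    contains

      pure subroutine minimize(weights, biases, gradient)
        ! Same dummy-argument declarations as the updated interface.
        real, intent(inout), pointer :: weights(:)
        real, intent(inout), pointer :: biases(:)
        real, intent(in), pointer :: gradient(:)
        weights = weights - 0.01 * gradient
        biases = biases - 0.01 * gradient
      end subroutine minimize

    end program pointer_args_demo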
@@ -116,27 +117,32 @@ impure elemental subroutine init_sgd(self, num_params)
  end subroutine init_sgd


- pure subroutine minimize_sgd(self, param, gradient)
+ pure subroutine minimize_sgd(self, weights, biases, gradient)
    !! Concrete implementation of a stochastic gradient descent optimizer
    !! update rule.
    class(sgd), intent(inout) :: self
-   real, intent(inout) :: param(:)
-   real, intent(in) :: gradient(:)
+   real, intent(inout), pointer :: weights(:)
+   real, intent(inout), pointer :: biases(:)
+   real, intent(in), pointer :: gradient(:)

    if (self % momentum > 0) then
      ! Apply momentum update
      self % velocity = self % momentum * self % velocity &
        - self % learning_rate * gradient
      if (self % nesterov) then
        ! Apply Nesterov update
-       param = param + self % momentum * self % velocity &
+       weights = weights + self % momentum * self % velocity &
+         - self % learning_rate * gradient
+       biases = biases + self % momentum * self % velocity &
          - self % learning_rate * gradient
      else
-       param = param + self % velocity
+       weights = weights + self % velocity
+       biases = biases + self % velocity
      end if
    else
      ! Apply regular update
-     param = param - self % learning_rate * gradient
+     weights = weights - self % learning_rate * gradient
+     biases = biases - self % learning_rate * gradient
    end if

  end subroutine minimize_sgd
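
For reference, the step that minimize_sgd now applies identically to the weights and biases pointers, restated with g for the gradient, \eta for learning_rate, \mu for momentum, and w standing for either parameter array:

    v \leftarrow \mu v - \eta g
    w \leftarrow w + \mu v - \eta g   (momentum > 0, Nesterov)
    w \leftarrow w + v                (momentum > 0, classical)
    w \leftarrow w - \eta g           (\mu = 0)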
@@ -152,18 +158,21 @@ impure elemental subroutine init_rmsprop(self, num_params)
  end subroutine init_rmsprop


- pure subroutine minimize_rmsprop(self, param, gradient)
+ pure subroutine minimize_rmsprop(self, weights, biases, gradient)
    !! Concrete implementation of a RMSProp optimizer update rule.
    class(rmsprop), intent(inout) :: self
-   real, intent(inout) :: param(:)
-   real, intent(in) :: gradient(:)
+   real, intent(inout), pointer :: weights(:)
+   real, intent(inout), pointer :: biases(:)
+   real, intent(in), pointer :: gradient(:)

    ! Compute the RMS of the gradient using the RMSProp rule
    self % rms_gradient = self % decay_rate * self % rms_gradient &
      + (1 - self % decay_rate) * gradient**2

    ! Update the network parameters based on the new RMS of the gradient
-   param = param - self % learning_rate &
+   weights = weights - self % learning_rate &
+     / sqrt(self % rms_gradient + self % epsilon) * gradient
+   biases = biases - self % learning_rate &
      / sqrt(self % rms_gradient + self % epsilon) * gradient

  end subroutine minimize_rmsprop
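
Restating the RMSProp step above with r for rms_gradient, \rho for decay_rate, and \eta for learning_rate; both the weights and the biases are moved by the same gradient and share the single running average r:

    r \leftarrow \rho r + (1 - \rho) g^2
    w \leftarrow w - \eta \, g / \sqrt{r + \epsilon}
    b \leftarrow b - \eta \, g / \sqrt{r + \epsilon}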
@@ -180,17 +189,18 @@ impure elemental subroutine init_adam(self, num_params)
  end subroutine init_adam


- pure subroutine minimize_adam(self, param, gradient)
+ pure subroutine minimize_adam(self, weights, biases, gradient)
    !! Concrete implementation of an Adam optimizer update rule.
    class(adam), intent(inout) :: self
-   real, intent(inout) :: param(:)
-   real, intent(in) :: gradient(:)
+   real, intent(inout), pointer :: weights(:)
+   real, intent(inout), pointer :: biases(:)
+   real, intent(in), pointer :: gradient(:)

    self % t = self % t + 1

    ! If weight_decay_l2 > 0, use L2 regularization;
    ! otherwise, default to regular Adam.
-   associate(g => gradient + self % weight_decay_l2 * param)
+   associate(g => gradient + self % weight_decay_l2 * weights)
      self % m = self % beta1 * self % m + (1 - self % beta1) * g
      self % v = self % beta2 * self % v + (1 - self % beta2) * g**2
    end associate
@@ -202,9 +212,15 @@ pure subroutine minimize_adam(self, param, gradient)
    )

      ! Update parameters.
-     param = param &
+     weights = weights &
        - self % learning_rate * (m_hat / (sqrt(v_hat) + self % epsilon) &
-       + self % weight_decay_decoupled * param)
+       + self % weight_decay_decoupled * weights)
+
+     ! Update biases (without weight decay for biases)
+     associate(g => gradient)
+       biases = biases &
+         - self % learning_rate * (m_hat / (sqrt(v_hat) + self % epsilon))
+     end associate

    end associate
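
Taken together, the two minimize_adam hunks implement the step below, where g_w = gradient + weight_decay_l2 * weights, \lambda = weight_decay_decoupled, and \hat{m}, \hat{v} are the bias-corrected moments computed in the part of the routine not shown in these hunks. The decoupled decay term applies to the weights only; the bias update reuses the same \hat{m} and \hat{v} with no decay term:

    m \leftarrow \beta_1 m + (1 - \beta_1) g_w
    v \leftarrow \beta_2 v + (1 - \beta_2) g_w^2
    w \leftarrow w - \eta \left( \hat{m} / (\sqrt{\hat{v}} + \epsilon) + \lambda w \right)
    b \leftarrow b - \eta \, \hat{m} / (\sqrt{\hat{v}} + \epsilon)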
@@ -221,30 +237,42 @@ impure elemental subroutine init_adagrad(self, num_params)
  end subroutine init_adagrad


- pure subroutine minimize_adagrad(self, param, gradient)
+ pure subroutine minimize_adagrad(self, weights, biases, gradient)
    !! Concrete implementation of an Adagrad optimizer update rule.
    class(adagrad), intent(inout) :: self
-   real, intent(inout) :: param(:)
-   real, intent(in) :: gradient(:)
+   real, intent(inout), pointer :: weights(:)
+   real, intent(inout), pointer :: biases(:)
+   real, intent(in), pointer :: gradient(:)

    ! Update the current time step
    self % t = self % t + 1

+   ! For weights
    associate( &
      ! If weight_decay_l2 > 0, use L2 regularization;
      ! otherwise, default to regular Adagrad.
-     g => gradient + self % weight_decay_l2 * param, &
+     g => gradient + self % weight_decay_l2 * weights, &
      ! Amortize the learning rate as function of the current time step.
      learning_rate => self % learning_rate &
        / (1 + (self % t - 1) * self % learning_rate_decay) &
    )

      self % sum_squared_gradient = self % sum_squared_gradient + g**2

-     param = param - learning_rate * g / (sqrt(self % sum_squared_gradient) &
+     weights = weights - learning_rate * g / (sqrt(self % sum_squared_gradient) &
        + self % epsilon)

    end associate
+
+   ! For biases (without weight decay)
+   associate( &
+     g => gradient, &
+     learning_rate => self % learning_rate &
+       / (1 + (self % t - 1) * self % learning_rate_decay) &
+   )
+     biases = biases - learning_rate * g / (sqrt(self % sum_squared_gradient) &
+       + self % epsilon)
+   end associate

  end subroutine minimize_adagrad

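The Adagrad hunk above amounts to the following, with s for sum_squared_gradient, d for learning_rate_decay, and g_w = gradient + weight_decay_l2 * weights. Note that s is accumulated from g_w in the weights block only; the bias update then reuses that same s with the raw gradient and no weight decay:

    \eta_t = \eta / (1 + (t - 1) d)
    s \leftarrow s + g_w^2
    w \leftarrow w - \eta_t \, g_w / (\sqrt{s} + \epsilon)
    b \leftarrow b - \eta_t \, g / (\sqrt{s} + \epsilon)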