@@ -123,10 +123,10 @@ non-scalar output.
optimizer.zero_grad()
scaled_loss = 0
for accumulated_step_i in range(N_STEPS):
-    out = model.forward()
-    loss = ...
-    loss.backward()
-    scaled_loss += loss.item()
+    out = model.forward()
+    loss = ...
+    loss.backward()
+    scaled_loss += loss.item()
optimizer.step()
actual_loss = scaled_loss / N_STEPS
```
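
For context, a minimal sketch of the same pattern applied over a `DataLoader` (here `loader`, `criterion`, and the `(inputs, targets)` batch format are placeholders); dividing the loss by `N_STEPS` before `backward()` makes the accumulated gradient match an average over the effective batch:

``` python
optimizer.zero_grad()
for step, (inputs, targets) in enumerate(loader):
    loss = criterion(model(inputs), targets) / N_STEPS  # scale so gradients average out
    loss.backward()                                      # gradients accumulate in .grad
    if (step + 1) % N_STEPS == 0:
        optimizer.step()
        optimizer.zero_grad()
```
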
@@ -146,10 +146,12 @@ torch.backends.cudnn.benchmark = False

``` python
for child in model.children():
-    for param in child.parameters():
-        param.requires_grad = False
+    for param in child.parameters():
+        param.requires_grad = False

-optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=...)
+optimizer = torch.optim.Adam(
+    filter(lambda p: p.requires_grad, model.parameters()), lr=...
+)
```

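The loop above freezes every child; more commonly you freeze only a pretrained backbone and leave a new head trainable. A hedged sketch, assuming a recent torchvision and a placeholder `num_classes`:

``` python
import torch
import torch.nn as nn
from torchvision import models

num_classes = 10  # placeholder

model = models.resnet18(weights=models.ResNet18_Weights.DEFAULT)  # pretrained backbone
for param in model.parameters():
    param.requires_grad = False  # freeze everything
model.fc = nn.Linear(model.fc.in_features, num_classes)  # new head, trainable by default

optimizer = torch.optim.Adam(
    filter(lambda p: p.requires_grad, model.parameters()), lr=1e-3
)
```
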
### Save and load weights
@@ -175,42 +177,50 @@ new_model = nn.Sequential(*list(model.children())[:-1])
### Get number of parameters

``` python
-num_params = sum(p.numel() for p in model.parameters())  # Total parameters
-num_trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)  # Trainable parameters
+num_params = sum(p.numel() for p in model.parameters())  # Total parameters
+num_trainable_params = sum(
+    p.numel() for p in model.parameters() if p.requires_grad
+)  # Trainable parameters
```
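
A quick way to report the two counts computed above:

``` python
print(f"trainable parameters: {num_trainable_params:,} / {num_params:,}")
```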

### No grad and inference_mode decorators

``` python
@torch.no_grad()
def eval(model, data):
-    model.eval()
+    model.eval()
+

@torch.inference_mode()
def eval(model, data):
-    model.eval()
+    model.eval()
```
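
Both also work as context managers, which is convenient for wrapping only the forward pass; a minimal sketch (`model` and `data` are placeholders):

``` python
model.eval()
with torch.inference_mode():  # or torch.no_grad()
    preds = model(data)
```

`inference_mode` is the stricter of the two: tensors created under it cannot be used in autograd afterwards, which is what lets it skip more bookkeeping.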

### Gradient clipping

``` python
-torch.nn.utils.clip_grad_value_(parameters=model.parameters(), clip_value=1.)
+torch.nn.utils.clip_grad_value_(parameters=model.parameters(), clip_value=1.0)
torch.nn.utils.clip_grad_norm_(parameters, max_norm, norm_type=2)
```
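
Clipping operates on the gradients, so it must run after `loss.backward()` and before `optimizer.step()`; a minimal sketch of the ordering (`criterion`, `inputs`, and `targets` are placeholders):

``` python
optimizer.zero_grad()
loss = criterion(model(inputs), targets)
loss.backward()
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)  # after backward ...
optimizer.step()                                                  # ... before step
```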

### Remove bias weight decay

``` python
def add_weight_decay(net, l2_value, skip_list=()):
-    decay, no_decay = [], []
-    for name, param in net.named_parameters():
-        if not param.requires_grad:
-            continue  # frozen weights
-        if len(param.shape) == 1 or name.endswith(".bias") or name in skip_list:
-            no_decay.append(param)
-        else:
-            decay.append(param)
-    return [{'params': no_decay, 'weight_decay': 0.}, {'params': decay, 'weight_decay': l2_value}]
+    decay, no_decay = [], []
+    for name, param in net.named_parameters():
+        if not param.requires_grad:
+            continue  # frozen weights
+        if len(param.shape) == 1 or name.endswith(".bias") or name in skip_list:
+            no_decay.append(param)
+        else:
+            decay.append(param)
+    return [
+        {"params": no_decay, "weight_decay": 0.0},
+        {"params": decay, "weight_decay": l2_value},
+    ]
+
+

params = add_weight_decay(net, 2e-5)
sgd = torch.optim.SGD(params, lr=0.1)
```
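
The 1-D check catches normalization-layer weights as well as biases, the parameters that weight decay commonly hurts. As a quick sanity check (with the same placeholder `net`), you can list which parameter names land in the no-decay group:

``` python
no_decay_names = [
    name
    for name, param in net.named_parameters()
    if param.requires_grad and (len(param.shape) == 1 or name.endswith(".bias"))
]
print(no_decay_names)  # typically biases plus BatchNorm/LayerNorm weights
```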
@@ -244,26 +254,29 @@ print(u.grad)
### Weight init

``` python
-def init_weights(net, init_type='normal', gain=0.02):
-    def init_func(m):
-        if isinstance(m, (nn.Conv2d, nn.Linear)):
-            if init_type == 'normal':
-                nn.init.normal_(m.weight.data, 0.0, gain)
-            elif init_type == 'xavier':
-                nn.init.xavier_normal_(m.weight.data, gain=gain)
-            elif init_type == 'kaiming':
-                nn.init.kaiming_normal_(m.weight.data, a=0, mode='fan_in')
-            elif init_type == 'orthogonal':
-                nn.init.orthogonal_(m.weight.data, gain=gain)
-            else:
-                raise NotImplementedError('initialization method [%s] is not implemented' % init_type)
-            if hasattr(m, 'bias') and m.bias is not None:
-                nn.init.constant_(m.bias.data, 0.0)
-        elif isinstance(m, nn.BatchNorm2d):
-            nn.init.normal_(m.weight.data, 1.0, gain)
-            nn.init.constant_(m.bias.data, 0.0)
-    print('initialize network with %s' % init_type)
-    net.apply(init_func)
+def init_weights(net, init_type="normal", gain=0.02):
+    def init_func(m):
+        if isinstance(m, (nn.Conv2d, nn.Linear)):
+            if init_type == "normal":
+                nn.init.normal_(m.weight.data, 0.0, gain)
+            elif init_type == "xavier":
+                nn.init.xavier_normal_(m.weight.data, gain=gain)
+            elif init_type == "kaiming":
+                nn.init.kaiming_normal_(m.weight.data, a=0, mode="fan_in")
+            elif init_type == "orthogonal":
+                nn.init.orthogonal_(m.weight.data, gain=gain)
+            else:
+                raise NotImplementedError(
+                    "initialization method [%s] is not implemented" % init_type
+                )
+            if hasattr(m, "bias") and m.bias is not None:
+                nn.init.constant_(m.bias.data, 0.0)
+        elif isinstance(m, nn.BatchNorm2d):
+            nn.init.normal_(m.weight.data, 1.0, gain)
+            nn.init.constant_(m.bias.data, 0.0)
+
+    print("initialize network with %s" % init_type)
+    net.apply(init_func)
```
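
A minimal usage sketch (the toy model is a placeholder; `apply` recurses over all submodules):

``` python
import torch.nn as nn

model = nn.Sequential(nn.Linear(10, 20), nn.ReLU(), nn.Linear(20, 2))
init_weights(model, init_type="xavier", gain=0.02)
```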

### Train/test/valid splits
@@ -282,7 +295,7 @@ losses.append(loss.item()) # good
### Copy an array

``` python
-a = torch.tensor([1., 2., 3.])
+a = torch.tensor([1.0, 2.0, 3.0])
b = a # WRONG: same reference
b = a.clone()
```
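
Note that `clone()` is differentiable and stays connected to the source tensor's autograd graph; for a fully independent copy of a tensor that requires grad, the usual pattern is `detach().clone()`:

``` python
import torch

a = torch.rand(3, requires_grad=True)
b = a.clone()           # copy, but still part of a's autograd graph
c = a.detach().clone()  # independent copy with no grad history
```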
@@ -292,8 +305,8 @@ b = a.clone()
### Construct tensors directly on GPUs

``` python
-t = torch.rand(2,2).cuda() # bad
-t = torch.rand(2,2, device='cuda') # good
+t = torch.rand(2, 2).cuda()  # bad
+t = torch.rand(2, 2, device="cuda")  # good
```
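
A common companion pattern, so the same code runs with or without a GPU:

``` python
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
t = torch.rand(2, 2, device=device)
```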

### Avoid CPU to GPU transfers or vice-versa
@@ -318,14 +331,14 @@ Set `torch.backends.cudnn.benchmark = True` Note that cudnn.benchmark will profi
For example, if we compute `x.cos().cos()`, we usually need to perform 4 global memory reads and writes.

``` python
-x1 = x.cos() # Read from x in global memory, write to x1
-x2 = x1.cos() # Read from x1 in global memory, write to x2
+x1 = x.cos()  # Read from x in global memory, write to x1
+x2 = x1.cos()  # Read from x1 in global memory, write to x2
```

But with operator fusion, we only need 2 global memory reads and writes, so operator fusion will speed it up by 2x.

``` python
-x2 = x.cos().cos() # Read from x in global memory, write to x2
+x2 = x.cos().cos()  # Read from x in global memory, write to x2
```
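
Pointwise chains like this are exactly what a fusing compiler targets. A hedged sketch using `torch.compile` (assuming PyTorch 2.x and a CUDA device), which can emit a single fused kernel for the two `cos` calls:

``` python
import torch

def f(x):
    return x.cos().cos()

f_compiled = torch.compile(f)  # JIT-compiles f; pointwise ops like cos can be fused
x = torch.rand(1_000_000, device="cuda")
y = f_compiled(x)  # ideally one fused kernel: read x once, write y once
```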

### Gradient checkpointing