@@ -369,13 +369,9 @@ def train(
         eval_loss, eval_metric, step_loss, step_metric = evaluation_helper(
             model, train_config, eval_dataloader, device
         )
-        # Print evaluation metrics
-        print(
-            f"Epoch {epoch + 1}: Eval Loss: {eval_loss.detach().cpu():.4f}, Eval metric: {eval_metric.detach().cpu():.4f}"
-        )
         if eval_loss < best_val_loss:
             best_val_loss = eval_loss
-            print(f"best eval loss on epoch {epoch + 1} is {best_val_loss:.4f}")
+            print(f"Best eval loss on epoch {epoch + 1} is {best_val_loss:.4f}")
 
         if is_rank_zero():
             tensorboard_updates.add_scalars("loss", {"eval": eval_loss}, total_train_steps)
@@ -385,6 +381,16 @@ def train(
         val_loss.append(float(eval_loss))
         val_metric.append(float(eval_metric))
 
+        if train_config.enable_ddp:
+            dist.all_reduce(eval_loss, op=dist.ReduceOp.SUM)
+            eval_loss /= get_num_ddp_devices()
+            dist.all_reduce(eval_metric, op=dist.ReduceOp.SUM)
+            eval_metric /= get_num_ddp_devices()
+
+        print(
+            f"Epoch {epoch + 1}: Eval Loss: {eval_loss.detach().cpu():.4f}, Eval metric: {eval_metric.detach().cpu():.4f}"
+        )
+
     # saving the adapters after completion of each epoch
     if train_config.save_model:
         if train_config.enable_ddp:
@@ -507,12 +513,6 @@ def evaluation_helper(model, train_config, eval_dataloader, device):
     else:
         eval_metric = torch.exp(eval_loss)
 
-    if train_config.enable_ddp:
-        dist.all_reduce(eval_loss, op=dist.ReduceOp.SUM)
-        eval_loss /= get_num_ddp_devices()
-        dist.all_reduce(eval_metric, op=dist.ReduceOp.SUM)
-        eval_metric /= get_num_ddp_devices()
-
     return eval_loss, eval_metric, val_step_loss, val_step_metric
 
 
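For context, this commit moves the DDP all_reduce averaging of eval_loss and eval_metric out of evaluation_helper and into train, so the per-epoch print runs after the cross-rank reduction and reports metrics averaged over all ranks. Below is a minimal sketch of that sum-then-divide averaging pattern, assuming a torch.distributed process group has already been initialized; the average_across_ranks helper name is illustrative and not part of this codebase.

import torch
import torch.distributed as dist

def average_across_ranks(value: torch.Tensor) -> torch.Tensor:
    # Sum the per-rank scalar tensor across all ranks, then divide by the
    # number of ranks to obtain the mean. Mirrors the all_reduce/divide
    # pattern the commit adds to train(); no-ops when DDP is not in use.
    if dist.is_available() and dist.is_initialized():
        dist.all_reduce(value, op=dist.ReduceOp.SUM)  # in-place sum over ranks
        value /= dist.get_world_size()                # convert the sum to a mean
    return value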