Skip to content

Commit

Permalink
[QOL] Log the nodelist. (microsoft#291)
Browse files (browse the repository at this point in the history)
* [QOL] Log the nodelist.

* Tweak.

* Tweak

* Tweak.

* lint
  • Loading branch information
stephenroller authored Aug 20, 2022
1 parent df3d814 commit d44a38f
Showing 1 changed file with 10 additions and 3 deletions.
13 changes: 10 additions & 3 deletions metaseq/distributed/utils.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -12,7 +12,6 @@
import socket
import struct
import subprocess
- import warnings
from argparse import Namespace
from collections import OrderedDict
from dataclasses import dataclass
Expand Down Expand Up @@ -124,10 +123,13 @@ def distributed_init(cfg: MetaseqConfig):

cfg = convert_namespace_to_omegaconf(cfg)

+ # silence torch's distributed initialization info
+ logging.getLogger("torch.distributed.distributed_c10d").setLevel(logging.WARNING)

if torch.distributed.is_available() and torch.distributed.is_initialized():
- warnings.warn("Distributed is already initialized, cannot initialize twice!")
+ logger.warning("Distributed is already initialized, cannot initialize twice!")
else:
- logger.info(
+ logger.debug(
"distributed init (rank {}): {}".format(
cfg.distributed_training.distributed_rank,
cfg.distributed_training.distributed_init_method,
Expand All @@ -152,11 +154,16 @@ def distributed_init(cfg: MetaseqConfig):

cfg.distributed_training.distributed_rank = torch.distributed.get_rank()

+ # set global log level
if is_master(cfg.distributed_training):
logging.getLogger().setLevel(logging.INFO)
else:
logging.getLogger().setLevel(logging.WARNING)

+ nodelist = os.environ.get("SLURM_STEP_NODELIST")
+ if nodelist:
+ logger.info(f"SLURM nodelist: {nodelist}")

if cfg.common.model_parallel_size > 1:
try:
from megatron.mpu import (
Expand Down

0 comments on commit d44a38f

Please sign in to comment.