
Commit 2fd81b0

Options to stagger model loading for low-memory systems
* `--stagger_load` (default: `0`, off): stagger model loading to avoid OOM issues on the host
* `--stagger_update_lazyhandle` (default: `0`, off): stagger update_lazyhandle to avoid OOM issues on the host
* `--dist_timeout` (default: set by PyTorch, `10` minutes for NCCL or `30` for other backends): torch distributed timeout, in minutes

Signed-off-by: Joshua Hursey <[email protected]>
1 parent f067cd9 commit 2fd81b0
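
For illustration, a small sketch of what these flag values mean in practice; the world size and flag values below are assumptions chosen for the example, not taken from the commit:

```python
# Illustrative only: with 8 ranks and --stagger_load 2 (assumed values),
# model loading proceeds in 4 sets of 2 ranks each; the same arithmetic
# applies to --stagger_update_lazyhandle.
import math

world_size, stagger = 8, 2
num_sets = math.ceil(world_size / float(stagger))            # -> 4
load_set = {rank: rank // stagger for rank in range(world_size)}
print(num_sets, load_set)  # 4 {0: 0, 1: 0, 2: 1, 3: 1, 4: 2, 5: 2, 6: 3, 7: 3}
```

Ranks in later sets simply wait at barriers until their set is reached, which is why `--dist_timeout` may also need to be raised.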

File tree

2 files changed: +61 −5 lines


aiu_fms_testing_utils/utils/__init__.py

Lines changed: 19 additions & 3 deletions
```diff
@@ -3,26 +3,42 @@
 import time
 from fms.utils.tokenizers import BaseTokenizer
 from fms.utils.generation import generate
-from aiu_fms_testing_utils.utils.aiu_setup import dprint
+from aiu_fms_testing_utils.utils.aiu_setup import dprint, rank, local_rank, world_size
 from typing import Optional, List, Tuple
 import os
 import requests
 import json
 import random
+import math

-def warmup_model(model: nn.Module, input_ids: torch.Tensor, max_new_tokens: int, compile_dynamic_sendnn = False, **padding_kwargs):
+def warmup_model(model: nn.Module, input_ids: torch.Tensor, max_new_tokens: int, compile_dynamic_sendnn = False, stagger_update_lazyhandle = 0, **padding_kwargs):
     import torch_sendnn
     dprint("AIU warmup")
-    pt_compile_model_time = time.time()
     extra_kwargs = {**padding_kwargs, "only_last_token": True}
     max_new_tokens_warmup = max_new_tokens
     if compile_dynamic_sendnn:
         max_new_tokens_warmup = 2
+
+    if stagger_update_lazyhandle > 0 and stagger_update_lazyhandle != world_size:
+        for _set in range( math.ceil(world_size / float(stagger_update_lazyhandle)) ):
+            if rank < (_set+1)*stagger_update_lazyhandle:
+                break
+            torch.distributed.barrier()
+        dprint(f"Stagger update_lazyhandle: Begin (Set: {_set+1} of {math.ceil(world_size / float(stagger_update_lazyhandle))})")
+
+    pt_compile_model_time = time.time()
     with torch_sendnn.warmup_mode():
         generate(model, input_ids, max_new_tokens=max_new_tokens_warmup, max_seq_len=model.config.max_expected_seq_len, use_cache=True, do_sample=False, contiguous_cache=True, extra_kwargs=extra_kwargs)
     pt_compile_model_time = time.time() - pt_compile_model_time
     dprint(f"PT compile complete, took {pt_compile_model_time:.3f}s")

+    if stagger_update_lazyhandle > 0 and stagger_update_lazyhandle != world_size:
+        for _set in range( math.ceil(world_size / float(stagger_update_lazyhandle)) ):
+            if rank >= (_set+1)*stagger_update_lazyhandle:
+                continue
+            torch.distributed.barrier()
+        dprint(f"Stagger update_lazyhandle: All Complete")
+
 def ids_for_prompt(prompt, tokenizer):
     tokens = tokenizer.tokenize(prompt)
     ids = tokenizer.convert_tokens_to_ids(tokens)
```
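
Read together, the two added loops implement a set-based gate around the warmup: an entry gate that admits `stagger_update_lazyhandle` ranks at a time, and an exit gate that keeps the barrier count identical on every rank. A minimal standalone sketch of that pattern, with a hypothetical `run_staggered` helper and `work` callable, assuming `torch.distributed` is already initialized:

```python
import math
import torch.distributed as dist

def run_staggered(work, stagger: int, rank: int, world_size: int):
    """Run `work` on at most `stagger` ranks at a time (hypothetical helper)."""
    if stagger <= 0 or stagger == world_size:
        work()
        return
    num_sets = math.ceil(world_size / float(stagger))
    # Entry gate: a rank in set k waits through k barriers before starting.
    for _set in range(num_sets):
        if rank < (_set + 1) * stagger:
            break
        dist.barrier()
    work()
    # Exit gate: release the later sets. Every rank hits num_sets barriers in
    # total, so the collective calls stay matched across ranks.
    for _set in range(num_sets):
        if rank >= (_set + 1) * stagger:
            continue
        dist.barrier()
```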

scripts/inference.py

Lines changed: 42 additions & 2 deletions
```diff
@@ -1,5 +1,6 @@
 # Standard
 import argparse
+import datetime
 from functools import partial
 import itertools
 import json
@@ -8,6 +9,7 @@
 import random
 import time
 import contextlib
+import math

 # Third Party
 from aiu_fms_testing_utils.utils import aiu_setup, warmup_model
@@ -218,6 +220,24 @@
     default=0,
     help="Set verbosity level (pass flag as `-v`, `-vv`, `-vvv`)"
 )
+parser.add_argument(
+    "--stagger_load",
+    type=int,
+    default=0,
+    help="Stagger model loading to avoid OOM issues on the host"
+)
+parser.add_argument(
+    "--stagger_update_lazyhandle",
+    type=int,
+    default=0,
+    help="Stagger update_lazyhandle to avoid OOM issues on the host"
+)
+parser.add_argument(
+    "--dist_timeout",
+    type=int,
+    default=0,
+    help="Timeout to use for messaging in minutes. Default set by PyTorch dist.init_process_group"
+)
 args = parser.parse_args()

 if args.quantization == "gptq":
@@ -260,7 +280,13 @@
 is_aiu_backend = "aiu" in args.device_type

 if args.distributed:
-    dist.init_process_group()
+    if args.dist_timeout > 0:
+        # Default timeout:
+        # https://docs.pytorch.org/docs/stable/distributed.html#torch.distributed.init_process_group
+        dist.init_process_group(timeout=datetime.timedelta(minutes=args.dist_timeout))
+        dprint(f"NOTICE: init_process_group timeout set to {args.dist_timeout} minutes")
+    else:
+        dist.init_process_group()
     # Fix until PT 2.3
     torch._C._distributed_c10d._register_process_group("default", dist.group.WORLD)
     aiu_setup.aiu_dist_setup(dist.get_rank(), dist.get_world_size())
@@ -438,6 +464,13 @@ def select_int8_module(
 dprint(f"data_type={default_dtype}")
 dprint("="*60 + "\n")

+if args.stagger_load > 0 and args.stagger_load != world_size:
+    for _set in range( math.ceil(world_size / float(args.stagger_load)) ):
+        if rank < (_set+1)*args.stagger_load:
+            break
+        torch.distributed.barrier()
+    dprint(f"Stagger Model Load: Begin (Set: {_set+1} of {math.ceil(world_size / float(args.stagger_load))})")
+
 model = get_model(
     args.architecture,
     args.variant,
@@ -467,6 +500,13 @@ def select_int8_module(
 loading_model_time = time.time() - loading_model_time
 dprint(f"loading complete, took {loading_model_time:.3f}s")

+if args.stagger_load > 0 and args.stagger_load != world_size:
+    for _set in range( math.ceil(world_size / float(args.stagger_load)) ):
+        if rank >= (_set+1)*args.stagger_load:
+            continue
+        torch.distributed.barrier()
+    dprint(f"Stagger Model Load: All Complete")
+
 if args.compile:
     dprint("compiling model")
     if is_aiu_backend:
@@ -695,7 +735,7 @@ def infer(use_cache, do_sample, warmup):
     dprint(f"compilation warmup")
     pt_compile_model_time = time.time()
     if args.device_type == "aiu": # only run warmup for AIU, no need for senulator
-        warmup_model(model, ids, args.max_new_tokens, args.compile_dynamic_sendnn, **extra_generation_kwargs)
+        warmup_model(model, ids, args.max_new_tokens, args.compile_dynamic_sendnn, args.stagger_update_lazyhandle, **extra_generation_kwargs)
     aiu_warmup_time = time.time()
     for sample, cache in itertools.product(do_sample, use_cache):
         infer(cache, sample, True)
```
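
Because ranks in the last set sit at barriers while every earlier set loads, the default distributed timeout can expire before they ever start. A rough, illustrative way to size `--dist_timeout` under staggering; the world size, stagger value, and per-set load time below are assumptions, not measurements from the commit:

```python
import math

world_size = 8              # assumed rank count
stagger_load = 2            # assumed --stagger_load value
per_set_load_minutes = 12   # assumed time for one set to load the model

num_sets = math.ceil(world_size / float(stagger_load))
# The last set waits through (num_sets - 1) earlier loads plus its own; add headroom.
suggested_timeout_minutes = num_sets * per_set_load_minutes + 10
print(f"--dist_timeout {suggested_timeout_minutes}")  # -> --dist_timeout 58
```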
