
Commit 57dd84b

[Enhance] Set max queue size for async saving hf checkpoint
`xtuner.v1.model.BaseModel.save_hf` uses a `ProcessPoolExecutor` to submit multiple saving tasks and speed up checkpoint saving. However, `ProcessPoolExecutor.submit` is non-blocking, so unbounded submission accumulates many CPU tensors and can cause a CPU OOM.
1 parent 3f96a84 commit 57dd84b
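
For illustration, here is a minimal, self-contained sketch of the bounded-submission pattern this change adopts: once a fixed number of save futures are in flight, wait for them before submitting more. The save_shard task, the wait_if_full helper, and the limit of 16 (mirroring the hf_save_worker default added below) are illustrative stand-ins, not xtuner APIs.

from concurrent.futures import Future, ProcessPoolExecutor, wait


def save_shard(shard_id: int) -> int:
    # Stand-in for writing one safetensors shard; the real task ships large
    # CPU tensors to a worker process.
    return shard_id


def wait_if_full(tasks: list[Future], limit: int) -> None:
    # Same idea as the _wait_save_task helper added in this commit: once
    # `limit` futures are queued, block until they finish, re-raise any
    # worker-side error, and reset the list so queued arguments can be freed.
    if len(tasks) < limit:
        return
    done, pending = wait(tasks)
    for future in done:
        if (exc := future.exception()) is not None:
            raise exc
    tasks.clear()
    tasks.extend(pending)


def main() -> None:
    futures: list[Future] = []
    with ProcessPoolExecutor(max_workers=4) as pool:
        for shard_id in range(100):
            futures.append(pool.submit(save_shard, shard_id))
            wait_if_full(futures, limit=16)
        wait(futures)  # drain whatever is still in flight


if __name__ == "__main__":
    main()

Because wait() defaults to return_when=ALL_COMPLETED, the queue is drained in batches: once the limit is reached the submitter blocks until every queued save has finished, so at most `limit` serialized shard payloads are pending at any time.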

File tree (5 files changed, +93 -1 lines):

ci/scripts/save_hf_test.py
xtuner/v1/model/base.py
xtuner/v1/model/compose/intern_s1/intern_s1_config.py
xtuner/v1/model/compose/internvl/internvl_config.py
xtuner/v1/model/compose/qwen3_vl/qwen3_vl_config.py

ci/scripts/save_hf_test.py

Lines changed: 74 additions & 0 deletions
@@ -0,0 +1,74 @@
+import argparse
+import time
+import torch
+import torch.distributed as dist
+from xtuner.v1.model import get_model_config_from_hf
+from xtuner.v1.config import FSDPConfig
+
+from memory_profiler import profile
+
+MB = 1024 ** 2
+
+def get_args():
+    p = argparse.ArgumentParser("Profile build/shard/save with @profile (RSS) and simple GPU stats")
+    p.add_argument("hf_path", type=str, help="HF model path")
+    p.add_argument("out", type=str, help="Output HF path")
+    p.add_argument("--ep", type=int, default=1, help="expert parallel size")
+    return p.parse_args()
+
+def set_device_for_rank():
+    if torch.cuda.is_available():
+        rank = dist.get_rank() if dist.is_initialized() else 0
+        torch.cuda.set_device(rank % torch.cuda.device_count())
+
+def gpu_mem(label):
+    if not torch.cuda.is_available():
+        print(f"[GPU] {label}: no CUDA")
+        return
+    torch.cuda.synchronize()
+    alloc = torch.cuda.memory_allocated() / MB
+    reserved = torch.cuda.memory_reserved() / MB
+    peak = torch.cuda.max_memory_allocated() / MB
+    print(f"[GPU] {label}: alloc={alloc:.2f}MB reserved={reserved:.2f}MB peak={peak:.2f}MB")
+
+def build_model(hf_path: str):
+    cfg = get_model_config_from_hf(hf_path)
+    model = cfg.build()
+    return model
+
+def shard_model(model, ep: int):
+    fsdp_cfg = FSDPConfig(ep_size=ep)
+    model.fully_shard(fsdp_config=fsdp_cfg)
+    return model
+
+@profile
+def save_model(model, out: str):
+    model.save_hf(out)
+
+def main():
+    args = get_args()
+
+    dist.init_process_group(backend="nccl")
+    set_device_for_rank()
+
+    t0 = time.perf_counter()
+    gpu_mem("init")
+
+    torch.cuda.reset_peak_memory_stats()
+    model = build_model(args.hf_path)
+    gpu_mem("after_build")
+
+    torch.cuda.reset_peak_memory_stats()
+    shard_model(model, args.ep)
+    gpu_mem("after_shard")
+
+    torch.cuda.reset_peak_memory_stats()
+    save_model(model, args.out)
+    gpu_mem("after_save")
+
+    print(f"[TIME] total={time.perf_counter()-t0:.3f}s")
+
+    dist.destroy_process_group()
+
+if __name__ == "__main__":
+    main()
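
The script is presumably meant to be launched through a distributed launcher such as torchrun, since it initializes a process group with the NCCL backend; it takes the source HF checkpoint path and the output path as positional arguments, plus an optional --ep expert-parallel size. The exact invocation is not part of this commit.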

xtuner/v1/model/base.py

Lines changed: 16 additions & 1 deletion
@@ -1,6 +1,6 @@
 import json
 import math
-from concurrent.futures import ProcessPoolExecutor, wait
+from concurrent.futures import Future, ProcessPoolExecutor, wait
 from functools import reduce
 from itertools import chain
 from pathlib import Path
@@ -73,6 +73,7 @@ class TransformerConfig(PydanticBaseModel):
     use_sliding_window: Annotated[bool, Parameter(group="model")] = False
     max_window_layers: Annotated[int | None, Parameter(group="model")] = None
     rope_scaling_cfg: RopeScalingConfig | None = None
+    hf_save_worker: Annotated[int, Parameter(group="model")] = 16
 
     @computed_field
     def num_attention_heads(self) -> int:
@@ -719,6 +720,7 @@ def _save_hf(self, hf_dir: Path | str, save_dtype: torch.dtype = torch.bfloat16)
                 hf_dir / safetensor_name,
             )
             save_futures.append(future)
+            self._wait_save_task(save_futures)
 
         safetensor_index = 0
         for name_list, hf_tensor_list in chain(same_gen, shard_gen):
@@ -742,6 +744,7 @@ def _save_hf(self, hf_dir: Path | str, save_dtype: torch.dtype = torch.bfloat16)
                 hf_dir / safetensor_name,
             )
             save_futures.append(future)
+            self._wait_save_task(save_futures)
 
         if save_executor is not None:
             wait(save_futures)
@@ -1103,3 +1106,15 @@ def _to_empty_meta(self):
             module.to_empty(device=self.device, recurse=False)
         DEVICE_MODULE.synchronize()
         return
+
+    def _wait_save_task(self, tasks: list[Future]):
+        "Limit the number of concurrent save tasks to avoid OOM."
+        if len(tasks) >= self.config.hf_save_worker:
+            done, pending = wait(tasks)
+            for future in done:
+                if (exception := future.exception()) is not None:
+                    raise exception
+            tasks.clear()
+            tasks.extend(pending)
+        else:
+            return

xtuner/v1/model/compose/intern_s1/intern_s1_config.py

Lines changed: 1 addition & 0 deletions
@@ -93,6 +93,7 @@ class InternS1BaseConfig(BaseModel):
     freeze_vision: bool = False
     freeze_projector: bool = False
     freeze_language: bool = False
+    hf_save_worker: int = 16
 
     def build(self) -> "InternS1ForConditionalGeneration":
         from .modeling_intern_s1 import InternS1ForConditionalGeneration

xtuner/v1/model/compose/internvl/internvl_config.py

Lines changed: 1 addition & 0 deletions
@@ -91,6 +91,7 @@ class InternVLBaseConfig(BaseModel):
     freeze_vision: bool = False
     freeze_projector: bool = False
     freeze_language: bool = False
+    hf_save_worker: int = 16
 
     def build(self) -> "InternVLForConditionalGeneration":
         from .modeling_internvl import InternVLForConditionalGeneration

xtuner/v1/model/compose/qwen3_vl/qwen3_vl_config.py

Lines changed: 1 addition & 0 deletions
@@ -78,6 +78,7 @@ class Qwen3VLBaseConfig(BaseModel):
     freeze_vision: bool = False
     freeze_projector: bool = False
     freeze_language: bool = False
+    hf_save_worker: int = 16
 
     def build(self):
         from .modeling_qwen3_vl import Qwen3VLForConditionalGeneration

0 commit comments