
Commit 005755e

Fixed comments

1 parent ef8288a commit 005755e

File tree

3 files changed: +192 -4 lines


py/torch_tensorrt/dynamo/_compiler.py

+11-3
@@ -500,6 +500,7 @@ def compile(
         enable_weight_streaming (bool): Enable weight streaming.
         tiling_optimization_level (str): The optimization level of tiling strategies. A higher level allows TensorRT to spend more time searching for a better tiling strategy. We currently support ["none", "fast", "moderate", "full"].
         l2_limit_for_tiling (int): The target L2 cache usage limit (in bytes) for tiling optimization (default is -1, which means no limit).
+        offload_module_to_cpu (bool): Offload the module to CPU. This is useful when we need to minimize GPU memory usage.
         **kwargs: Any,
     Returns:
         torch.fx.GraphModule: Compiled FX Module; when run it will execute via TensorRT
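For context, a minimal usage sketch of the new flag (a hypothetical example mirroring the tests added below, not part of the committed diff). It assumes a torchvision ResNet18 and shows the intended workflow: the PyTorch weights move to host memory while the TensorRT engine is built.

import torch
import torch_tensorrt as torchtrt
import torchvision.models as models

model = models.resnet18().eval().cuda()
inputs = (torch.randn((1, 3, 224, 224), device="cuda"),)

exp_program = torch.export.export(model, inputs)
# offload_module_to_cpu=True frees the module's GPU memory for the engine build
trt_gm = torchtrt.dynamo.compile(
    exp_program,
    inputs,
    offload_module_to_cpu=True,
)
# After compilation the original module lives on the CPU; move it back if needed
model.cuda()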
@@ -678,17 +679,24 @@ def compile(
     )

     gm = exported_program.module()
-    # Move the weights in the state_dict to CPU
     logger.debug("Input graph: " + str(gm.graph))

     # Apply lowering on the graph module
     gm = post_lowering(gm, settings)
     logger.debug("Lowered Input graph: " + str(gm.graph))
+
+    # Move the weights in the state_dict to CPU
     if offload_module_to_cpu:
         exported_program.module().to(CPU_DEVICE)
         logger.info(
-            "The model is offloaded to CPU during compilation. If you want to keep the model on GPU, set offload_module_to_cpu=False."
+            "The PyTorch model was moved to the CPU to allocate all GPU memory to TensorRT. To retain the model on the GPU, set offload_module_to_cpu=False."
         )
+    else:
+        remaining_memory, total_memory = torch.cuda.mem_get_info()
+        if remaining_memory < total_memory // 2:
+            logger.warning(
+                "Remaining GPU memory may not be enough to compile the TensorRT engine for this model, resulting in an OOM error. Consider setting offload_module_to_cpu=True."
+            )
     trt_gm = compile_module(
         gm, trt_arg_inputs, trt_kwarg_inputs, settings, engine_cache
     )
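The new warning path relies on torch.cuda.mem_get_info(), which returns (free_bytes, total_bytes) for the current CUDA device; the heuristic flags the case where less than half the card is free. The same check in isolation, as a standalone sketch:

import torch

free_bytes, total_bytes = torch.cuda.mem_get_info()  # (free, total) in bytes
if free_bytes < total_bytes // 2:
    # Less than half the device memory is free; the engine build may OOM
    print(f"{free_bytes / 2**30:.1f} GiB free of {total_bytes / 2**30:.1f} GiB")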
@@ -833,7 +841,7 @@ def contains_metadata(gm: torch.fx.GraphModule) -> bool:
                 str(name),
                 str(submodule.graph),
             )
-            submodule.to(torch.cuda.current_device())
+            submodule.to(to_torch_device(settings.device))
             continue

         if name not in submodule_node_dict:
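A note on the one-line fix above: torch.cuda.current_device() returns only an integer index for whichever device happens to be active, while settings.device carries the device the user actually requested at compile time; to_torch_device (a helper in torch_tensorrt.dynamo.utils, whose accepted argument types are assumed here) normalizes that setting into a torch.device that submodule.to() accepts. A rough sketch of the distinction:

import torch
from torch_tensorrt.dynamo.utils import to_torch_device  # assumed import path

idx = torch.cuda.current_device()   # e.g. 0: whatever device is currently active
dev = to_torch_device("cuda:1")     # hypothetical target: the user's requested device
# Falling back to idx would silently ignore a non-default settings.device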

tests/py/dynamo/models/test_export_serde.py

+113-1
@@ -6,7 +6,11 @@
 import torch
 import torch_tensorrt as torchtrt
 import torchvision.models as models
-from torch_tensorrt.dynamo.utils import COSINE_THRESHOLD, cosine_similarity
+from torch_tensorrt.dynamo.utils import (
+    COSINE_THRESHOLD,
+    cosine_similarity,
+    get_model_device,
+)

 assertions = unittest.TestCase()
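get_model_device, newly imported here, reports where a module's weights currently live; the CPU-offload tests below use it to assert that the model really moved. For readers without the helper, a rough plain-PyTorch equivalent (a sketch, assuming a module with at least one parameter):

import torch

def model_device(model: torch.nn.Module) -> torch.device:
    # The device of the first parameter stands in for "the model's device"
    return next(model.parameters()).device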
@@ -283,6 +287,53 @@ def test_resnet18(ir):
     )


+@pytest.mark.unit
+def test_resnet18_cpu_offload(ir):
+    """
+    This tests export save and load functionality on a Resnet18 model compiled with offload_module_to_cpu enabled
+    """
+    model = models.resnet18().eval().cuda()
+    input = torch.randn((1, 3, 224, 224)).to("cuda")
+
+    compile_spec = {
+        "inputs": [
+            torchtrt.Input(
+                input.shape, dtype=torch.float, format=torch.contiguous_format
+            )
+        ],
+        "ir": ir,
+        "min_block_size": 1,
+        "cache_built_engines": False,
+        "reuse_cached_engines": False,
+        "offload_module_to_cpu": True,
+    }
+
+    exp_program = torchtrt.dynamo.trace(model, **compile_spec)
+    trt_module = torchtrt.dynamo.compile(exp_program, **compile_spec)
+    assertions.assertTrue(
+        get_model_device(model).type == "cpu",
+        msg="Model should be offloaded to CPU",
+    )
+    model.cuda()
+    torchtrt.save(trt_module, trt_ep_path)
+
+    deser_trt_module = torchtrt.load(trt_ep_path).module()
+    outputs_pyt = model(input)
+    outputs_trt = trt_module(input)
+    cos_sim = cosine_similarity(outputs_pyt, outputs_trt[0])
+    assertions.assertTrue(
+        cos_sim > COSINE_THRESHOLD,
+        msg=f"test_resnet18_cpu_offload TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}",
+    )
+
+    outputs_trt_deser = deser_trt_module(input)
+    cos_sim = cosine_similarity(outputs_pyt, outputs_trt_deser[0])
+    assertions.assertTrue(
+        cos_sim > COSINE_THRESHOLD,
+        msg=f"test_resnet18_cpu_offload deserialized TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}",
+    )
+
+
 @pytest.mark.unit
 def test_resnet18_dynamic(ir):
     """
@@ -381,6 +432,67 @@ def forward(self, x):
     )


+@pytest.mark.unit
+def test_hybrid_conv_fallback_cpu_offload(ir):
+    """
+    This tests export save and load functionality on a hybrid
+    model where a conv (a weighted layer) has been forced to fall back to PyTorch.
+    """
+
+    class MyModule(torch.nn.Module):
+        def __init__(self):
+            super().__init__()
+            self.conv = torch.nn.Conv2d(3, 16, 3, stride=1, bias=True)
+            self.relu = torch.nn.ReLU()
+
+        def forward(self, x):
+            conv = self.conv(x)
+            relu = self.relu(conv)
+            mul = relu * 0.5
+            return mul
+
+    model = MyModule().eval().cuda()
+    input = torch.randn((1, 3, 224, 224)).to("cuda")
+
+    compile_spec = {
+        "inputs": [
+            torchtrt.Input(
+                input.shape, dtype=torch.float, format=torch.contiguous_format
+            )
+        ],
+        "ir": ir,
+        "min_block_size": 1,
+        "torch_executed_ops": {"torch.ops.aten.convolution.default"},
+        "cache_built_engines": False,
+        "reuse_cached_engines": False,
+        "offload_module_to_cpu": True,
+    }
+
+    exp_program = torchtrt.dynamo.trace(model, **compile_spec)
+    trt_module = torchtrt.dynamo.compile(exp_program, **compile_spec)
+    model.cuda()
+    torchtrt.save(trt_module, trt_ep_path)
+
+    deser_trt_module = torchtrt.load(trt_ep_path).module()
+    outputs_pyt = model(input)
+    outputs_trt = trt_module(input)
+
+    for idx in range(len(outputs_pyt)):
+        cos_sim = cosine_similarity(outputs_pyt[idx], outputs_trt[idx])
+        assertions.assertTrue(
+            cos_sim > COSINE_THRESHOLD,
+            msg=f"test_hybrid_conv_fallback_cpu_offload TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}",
+        )
+
+    outputs_trt_deser = deser_trt_module(input)
+    for idx in range(len(outputs_pyt)):
+        cos_sim = cosine_similarity(outputs_pyt[idx], outputs_trt_deser[idx])
+        assertions.assertTrue(
+            cos_sim > COSINE_THRESHOLD,
+            msg=f"test_hybrid_conv_fallback_cpu_offload deserialized TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}",
+        )
+
+
 @pytest.mark.unit
 def test_arange_export(ir):
     """

tests/py/dynamo/models/test_model_refit.py

+68
@@ -492,6 +492,74 @@ def forward(self, x):
     torch._dynamo.reset()


+@pytest.mark.unit
+def test_refit_multiple_engine_with_weightmap_cpu_offload():
+    class net(nn.Module):
+        def __init__(self):
+            super().__init__()
+            self.conv1 = nn.Conv2d(3, 12, 3, padding=1)
+            self.bn = nn.BatchNorm2d(12)
+            self.conv2 = nn.Conv2d(12, 12, 3, padding=1)
+            self.fc1 = nn.Linear(12 * 56 * 56, 10)
+
+        def forward(self, x):
+            x = self.conv1(x)
+            x = F.relu(x)
+            x = self.bn(x)
+            x = F.max_pool2d(x, (2, 2))
+            x = self.conv2(x)
+            x = F.relu(x)
+            x = F.max_pool2d(x, (2, 2))
+            x = torch.flatten(x, 1)
+            return self.fc1(x)
+
+    model = net().eval().to("cuda")
+    model2 = net().eval().to("cuda")
+
+    inputs = [torch.randn((1, 3, 224, 224)).to("cuda")]
+    enabled_precisions = {torch.float}
+    debug = False
+    min_block_size = 1
+    use_python_runtime = False
+
+    exp_program = torch.export.export(model, tuple(inputs))
+    exp_program2 = torch.export.export(model2, tuple(inputs))
+
+    torch_executed_ops = {"torch.ops.aten.convolution.default"}
+    trt_gm = torchtrt.dynamo.compile(
+        exp_program,
+        tuple(inputs),
+        use_python_runtime=use_python_runtime,
+        enabled_precisions=enabled_precisions,
+        debug=debug,
+        min_block_size=min_block_size,
+        immutable_weights=False,
+        torch_executed_ops=torch_executed_ops,
+        reuse_cached_engines=False,
+        offload_module_to_cpu=True,
+    )
+
+    new_trt_gm = refit_module_weights(
+        compiled_module=trt_gm,
+        new_weight_module=exp_program2,
+        arg_inputs=inputs,
+        use_weight_map_cache=True,
+    )
+    model2.cuda()
+    # Check the output
+    expected_outputs, refitted_outputs = exp_program2.module()(*inputs), new_trt_gm(
+        *inputs
+    )
+    for expected_output, refitted_output in zip(expected_outputs, refitted_outputs):
+        assertions.assertTrue(
+            torch.allclose(expected_output, refitted_output, 1e-2, 1e-2),
+            "Refit Result is not correct. Refit failed",
+        )
+    # Clean up model env
+
+    torch._dynamo.reset()
+
+
 @unittest.skipIf(
     not torch_trt.ENABLED_FEATURES.torch_tensorrt_runtime,
     "TorchScript Frontend is not available",
