
Commit 005755e

Fixed comments

1 parent ef8288a commit 005755e

File tree

3 files changed: +192 -4 lines


py/torch_tensorrt/dynamo/_compiler.py

+11-3
@@ -500,6 +500,7 @@ def compile(
         enable_weight_streaming (bool): Enable weight streaming.
         tiling_optimization_level (str): The optimization level of tiling strategies. A higher level allows TensorRT to spend more time searching for a better tiling strategy. We currently support ["none", "fast", "moderate", "full"].
         l2_limit_for_tiling (int): The target L2 cache usage limit (in bytes) for tiling optimization (default is -1, which means no limit).
+        offload_module_to_cpu (bool): Offload the module to CPU. This is useful when we need to minimize GPU memory usage.
         **kwargs: Any,
     Returns:
         torch.fx.GraphModule: Compiled FX Module; when run it will execute via TensorRT
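For context, a minimal usage sketch of the new flag (a hypothetical example mirroring the tests added below, not part of the committed diff). It assumes a torchvision ResNet18 and shows the intended workflow: the PyTorch weights move to host memory while the TensorRT engine is built.

import torch
import torch_tensorrt as torchtrt
import torchvision.models as models

model = models.resnet18().eval().cuda()
inputs = (torch.randn((1, 3, 224, 224), device="cuda"),)

exp_program = torch.export.export(model, inputs)
# offload_module_to_cpu=True frees the module's GPU memory for the engine build
trt_gm = torchtrt.dynamo.compile(
    exp_program,
    inputs,
    offload_module_to_cpu=True,
)
# After compilation the original module lives on the CPU; move it back if needed
model.cuda()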
@@ -678,17 +679,24 @@ def compile(
     )

     gm = exported_program.module()
-    # Move the weights in the state_dict to CPU
     logger.debug("Input graph: " + str(gm.graph))

     # Apply lowering on the graph module
     gm = post_lowering(gm, settings)
     logger.debug("Lowered Input graph: " + str(gm.graph))
+
+    # Move the weights in the state_dict to CPU
     if offload_module_to_cpu:
         exported_program.module().to(CPU_DEVICE)
         logger.info(
-            "The model is offloaded to CPU during compilation. If you want to keep the model on GPU, set offload_module_to_cpu=False."
+            "The PyTorch model was moved to the CPU to allocate all GPU memory to TensorRT. To retain the model on the GPU, set offload_module_to_cpu=False."
         )
+    else:
+        remaining_memory, total_memory = torch.cuda.mem_get_info()
+        if remaining_memory < total_memory // 2:
+            logger.warning(
+                "Remaining GPU memory may not be enough to compile the TensorRT engine for this model, resulting in an OOM error. Consider setting offload_module_to_cpu=True."
+            )
     trt_gm = compile_module(
         gm, trt_arg_inputs, trt_kwarg_inputs, settings, engine_cache
     )
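The new warning path relies on torch.cuda.mem_get_info(), which returns (free_bytes, total_bytes) for the current CUDA device; the heuristic flags the case where less than half the card is free. The same check in isolation, as a standalone sketch:

import torch

free_bytes, total_bytes = torch.cuda.mem_get_info()  # (free, total) in bytes
if free_bytes < total_bytes // 2:
    # Less than half the device memory is free; the engine build may OOM
    print(f"{free_bytes / 2**30:.1f} GiB free of {total_bytes / 2**30:.1f} GiB")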
@@ -833,7 +841,7 @@ def contains_metadata(gm: torch.fx.GraphModule) -> bool:
                 str(name),
                 str(submodule.graph),
             )
-            submodule.to(torch.cuda.current_device())
+            submodule.to(to_torch_device(settings.device))
             continue

         if name not in submodule_node_dict:
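A note on the one-line fix above: torch.cuda.current_device() returns only an integer index for whichever device happens to be active, while settings.device carries the device the user actually requested at compile time; to_torch_device (a helper in torch_tensorrt.dynamo.utils, whose accepted argument types are assumed here) normalizes that setting into a torch.device that submodule.to() accepts. A rough sketch of the distinction:

import torch
from torch_tensorrt.dynamo.utils import to_torch_device  # assumed import path

idx = torch.cuda.current_device()   # e.g. 0: whatever device is currently active
dev = to_torch_device("cuda:1")     # hypothetical target: the user's requested device
# Falling back to idx would silently ignore a non-default settings.device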

tests/py/dynamo/models/test_export_serde.py

+113-1
@@ -6,7 +6,11 @@
 import torch
 import torch_tensorrt as torchtrt
 import torchvision.models as models
-from torch_tensorrt.dynamo.utils import COSINE_THRESHOLD, cosine_similarity
+from torch_tensorrt.dynamo.utils import (
+    COSINE_THRESHOLD,
+    cosine_similarity,
+    get_model_device,
+)

 assertions = unittest.TestCase()
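get_model_device, newly imported here, reports where a module's weights currently live; the CPU-offload tests below use it to assert that the model really moved. For readers without the helper, a rough plain-PyTorch equivalent (a sketch, assuming a module with at least one parameter):

import torch

def model_device(model: torch.nn.Module) -> torch.device:
    # The device of the first parameter stands in for "the model's device"
    return next(model.parameters()).device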
@@ -283,6 +287,53 @@ def test_resnet18(ir):
     )


+@pytest.mark.unit
+def test_resnet18_cpu_offload(ir):
+    """
+    This tests export save and load functionality on a Resnet18 model compiled with offload_module_to_cpu enabled
+    """
+    model = models.resnet18().eval().cuda()
+    input = torch.randn((1, 3, 224, 224)).to("cuda")
+
+    compile_spec = {
+        "inputs": [
+            torchtrt.Input(
+                input.shape, dtype=torch.float, format=torch.contiguous_format
+            )
+        ],
+        "ir": ir,
+        "min_block_size": 1,
+        "cache_built_engines": False,
+        "reuse_cached_engines": False,
+        "offload_module_to_cpu": True,
+    }
+
+    exp_program = torchtrt.dynamo.trace(model, **compile_spec)
+    trt_module = torchtrt.dynamo.compile(exp_program, **compile_spec)
+    assertions.assertTrue(
+        get_model_device(model).type == "cpu",
+        msg="Model should be offloaded to CPU",
+    )
+    model.cuda()
+    torchtrt.save(trt_module, trt_ep_path)
+
+    deser_trt_module = torchtrt.load(trt_ep_path).module()
+    outputs_pyt = model(input)
+    outputs_trt = trt_module(input)
+    cos_sim = cosine_similarity(outputs_pyt, outputs_trt[0])
+    assertions.assertTrue(
+        cos_sim > COSINE_THRESHOLD,
+        msg=f"test_resnet18_cpu_offload TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}",
+    )
+
+    outputs_trt_deser = deser_trt_module(input)
+    cos_sim = cosine_similarity(outputs_pyt, outputs_trt_deser[0])
+    assertions.assertTrue(
+        cos_sim > COSINE_THRESHOLD,
+        msg=f"test_resnet18_cpu_offload deserialized TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}",
+    )
+
+
 @pytest.mark.unit
 def test_resnet18_dynamic(ir):
     """
@@ -381,6 +432,67 @@ def forward(self, x):
     )


+@pytest.mark.unit
+def test_hybrid_conv_fallback_cpu_offload(ir):
+    """
+    This tests export save and load functionality on a hybrid
+    model where a conv (a weighted layer) has been forced to fall back to PyTorch.
+    """
+
+    class MyModule(torch.nn.Module):
+        def __init__(self):
+            super().__init__()
+            self.conv = torch.nn.Conv2d(3, 16, 3, stride=1, bias=True)
+            self.relu = torch.nn.ReLU()
+
+        def forward(self, x):
+            conv = self.conv(x)
+            relu = self.relu(conv)
+            mul = relu * 0.5
+            return mul
+
+    model = MyModule().eval().cuda()
+    input = torch.randn((1, 3, 224, 224)).to("cuda")
+
+    compile_spec = {
+        "inputs": [
+            torchtrt.Input(
+                input.shape, dtype=torch.float, format=torch.contiguous_format
+            )
+        ],
+        "ir": ir,
+        "min_block_size": 1,
+        "torch_executed_ops": {"torch.ops.aten.convolution.default"},
+        "cache_built_engines": False,
+        "reuse_cached_engines": False,
+        "offload_module_to_cpu": True,
+    }
+
+    exp_program = torchtrt.dynamo.trace(model, **compile_spec)
+    trt_module = torchtrt.dynamo.compile(exp_program, **compile_spec)
+    model.cuda()
+    torchtrt.save(trt_module, trt_ep_path)
+
+    deser_trt_module = torchtrt.load(trt_ep_path).module()
+    outputs_pyt = model(input)
+    outputs_trt = trt_module(input)
+
+    for idx in range(len(outputs_pyt)):
+        cos_sim = cosine_similarity(outputs_pyt[idx], outputs_trt[idx])
+        assertions.assertTrue(
+            cos_sim > COSINE_THRESHOLD,
+            msg=f"test_hybrid_conv_fallback_cpu_offload TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}",
+        )
+
+    outputs_trt_deser = deser_trt_module(input)
+    for idx in range(len(outputs_pyt)):
+        cos_sim = cosine_similarity(outputs_pyt[idx], outputs_trt_deser[idx])
+        assertions.assertTrue(
+            cos_sim > COSINE_THRESHOLD,
+            msg=f"test_hybrid_conv_fallback_cpu_offload deserialized TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}",
+        )
+
+
 @pytest.mark.unit
 def test_arange_export(ir):
     """

tests/py/dynamo/models/test_model_refit.py

+68
@@ -492,6 +492,74 @@ def forward(self, x):
     torch._dynamo.reset()


+@pytest.mark.unit
+def test_refit_multiple_engine_with_weightmap_cpu_offload():
+    class net(nn.Module):
+        def __init__(self):
+            super().__init__()
+            self.conv1 = nn.Conv2d(3, 12, 3, padding=1)
+            self.bn = nn.BatchNorm2d(12)
+            self.conv2 = nn.Conv2d(12, 12, 3, padding=1)
+            self.fc1 = nn.Linear(12 * 56 * 56, 10)
+
+        def forward(self, x):
+            x = self.conv1(x)
+            x = F.relu(x)
+            x = self.bn(x)
+            x = F.max_pool2d(x, (2, 2))
+            x = self.conv2(x)
+            x = F.relu(x)
+            x = F.max_pool2d(x, (2, 2))
+            x = torch.flatten(x, 1)
+            return self.fc1(x)
+
+    model = net().eval().to("cuda")
+    model2 = net().eval().to("cuda")
+
+    inputs = [torch.randn((1, 3, 224, 224)).to("cuda")]
+    enabled_precisions = {torch.float}
+    debug = False
+    min_block_size = 1
+    use_python_runtime = False
+
+    exp_program = torch.export.export(model, tuple(inputs))
+    exp_program2 = torch.export.export(model2, tuple(inputs))
+
+    torch_executed_ops = {"torch.ops.aten.convolution.default"}
+    trt_gm = torchtrt.dynamo.compile(
+        exp_program,
+        tuple(inputs),
+        use_python_runtime=use_python_runtime,
+        enabled_precisions=enabled_precisions,
+        debug=debug,
+        min_block_size=min_block_size,
+        immutable_weights=False,
+        torch_executed_ops=torch_executed_ops,
+        reuse_cached_engines=False,
+        offload_module_to_cpu=True,
+    )
+
+    new_trt_gm = refit_module_weights(
+        compiled_module=trt_gm,
+        new_weight_module=exp_program2,
+        arg_inputs=inputs,
+        use_weight_map_cache=True,
+    )
+    model2.cuda()
+    # Check the output
+    expected_outputs, refitted_outputs = exp_program2.module()(*inputs), new_trt_gm(
+        *inputs
+    )
+    for expected_output, refitted_output in zip(expected_outputs, refitted_outputs):
+        assertions.assertTrue(
+            torch.allclose(expected_output, refitted_output, 1e-2, 1e-2),
+            "Refit Result is not correct. Refit failed",
+        )
+    # Clean up model env
+
+    torch._dynamo.reset()
+
+
 @unittest.skipIf(
     not torch_trt.ENABLED_FEATURES.torch_tensorrt_runtime,
     "TorchScript Frontend is not available",
