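Optimize FLUX compilation memory usage: defer moving the pipeline to the GPU and call torch.cuda.empty_cache() before TRT engine conversion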

cehongwang · cehongwang · commit 044acdfa7aeb · 2025-04-25T02:27:05.000Z
diff --git a/examples/apps/flux-demo.py b/examples/apps/flux-demo.py
@@ -10,7 +10,7 @@
     "black-forest-labs/FLUX.1-dev",
     torch_dtype=torch.float16,
 )
-pipe.to(DEVICE).to(torch.float16)
+pipe.to(torch.float16)
 backbone = pipe.transformer
 
 
@@ -44,10 +44,11 @@
     "immutable_weights": False,
     "enable_cuda_graph": True,
 }
-
+backbone.to(DEVICE)
 trt_gm = torch_tensorrt.MutableTorchTensorRTModule(backbone, **settings)
 trt_gm.set_expected_dynamic_shape_range((), dynamic_shapes)
 pipe.transformer = trt_gm
+pipe.to(DEVICE)
 
 
 def generate_image(prompt, inference_step, batch_size=2):
diff --git a/py/torch_tensorrt/dynamo/_compiler.py b/py/torch_tensorrt/dynamo/_compiler.py
@@ -892,7 +892,7 @@ def contains_metadata(gm: torch.fx.GraphModule) -> bool:
         parse_graph_io(submodule, subgraph_data)
         dryrun_tracker.tensorrt_graph_count += 1
         dryrun_tracker.per_subgraph_data.append(subgraph_data)
-
+        torch.cuda.empty_cache()
         # Create TRT engines from submodule
         if not settings.dryrun:
             trt_module = convert_module(