
Commit c66c08a

TensorRT-LLM import fix and aot_joint_export specify as explicit setting in dynamo.compile
- TRT-LLM installation utilities and added test cases
- Added the option in _compiler.py
- Changes in the TRT-LLM loading tool: removed install_wget, install_unzip, install_mpi
- Further changes in error logging of the TRT-LLM installation tool
- Moved load_tensorrt_llm to dynamo/utils.py
- Corrected a misprint in the TRT-LLM load
- Used a Python library for the download to make it platform agnostic
- Updated the dll file path for Windows
- Corrected the non-critical lint error
- Included the version in versions.txt
1 parent ab9e309 commit c66c08a

File tree

6 files changed: +149 -67 lines changed


dev_dep_versions.yml

Lines changed: 1 addition & 0 deletions
@@ -1,2 +1,3 @@
 __cuda_version__: "12.8"
 __tensorrt_version__: "10.11.0"
+__tensorrt_llm_version__: "0.17.0.post1"

py/torch_tensorrt/dynamo/_compiler.py

Lines changed: 12 additions & 0 deletions
@@ -103,6 +103,7 @@ def cross_compile_for_windows(
     tiling_optimization_level: str = _defaults.TILING_OPTIMIZATION_LEVEL,
     l2_limit_for_tiling: int = _defaults.L2_LIMIT_FOR_TILING,
     offload_module_to_cpu: bool = _defaults.OFFLOAD_MODULE_TO_CPU,
+    use_distributed_mode_trace: bool = _defaults.USE_DISTRIBUTED_MODE_TRACE,
     **kwargs: Any,
 ) -> torch.fx.GraphModule:
     """Compile an ExportedProgram module using TensorRT in Linux for Inference in Windows
@@ -177,6 +178,7 @@ def cross_compile_for_windows(
         enable_weight_streaming (bool): Enable weight streaming.
         tiling_optimization_level (str): The optimization level of tiling strategies. A higher level allows TensorRT to spend more time searching for better tiling strategy. We currently support ["none", "fast", "moderate", "full"].
         l2_limit_for_tiling (int): The target L2 cache usage limit (in bytes) for tiling optimization (default is -1 which means no limit).
+        use_distributed_mode_trace (bool): Whether to use aot_autograd to trace the graph. Enable this when DTensors or distributed tensors are present in the model.
         **kwargs: Any,
     Returns:
         torch.fx.GraphModule: Compiled FX Module, when run it will execute via TensorRT
@@ -339,6 +341,7 @@ def cross_compile_for_windows(
         "enable_weight_streaming": enable_weight_streaming,
         "tiling_optimization_level": tiling_optimization_level,
         "l2_limit_for_tiling": l2_limit_for_tiling,
+        "use_distributed_mode_trace": use_distributed_mode_trace,
     }

     # disable the following settings is not supported for cross compilation for windows feature
@@ -439,6 +442,7 @@ def compile(
     tiling_optimization_level: str = _defaults.TILING_OPTIMIZATION_LEVEL,
     l2_limit_for_tiling: int = _defaults.L2_LIMIT_FOR_TILING,
     offload_module_to_cpu: bool = _defaults.OFFLOAD_MODULE_TO_CPU,
+    use_distributed_mode_trace: bool = _defaults.USE_DISTRIBUTED_MODE_TRACE,
     **kwargs: Any,
 ) -> torch.fx.GraphModule:
     """Compile an ExportedProgram module for NVIDIA GPUs using TensorRT
@@ -515,7 +519,8 @@ def compile(
         enable_weight_streaming (bool): Enable weight streaming.
         tiling_optimization_level (str): The optimization level of tiling strategies. A higher level allows TensorRT to spend more time searching for better tiling strategy. We currently support ["none", "fast", "moderate", "full"].
         l2_limit_for_tiling (int): The target L2 cache usage limit (in bytes) for tiling optimization (default is -1 which means no limit).
         offload_module_to_cpu (bool): Offload the module to CPU. This is useful when we need to minimize GPU memory usage.
+        use_distributed_mode_trace (bool): Whether to use aot_autograd to trace the graph. Enable this when DTensors or distributed tensors are present in the model.
         **kwargs: Any,
     Returns:
         torch.fx.GraphModule: Compiled FX Module, when run it will execute via TensorRT
@@ -688,6 +696,7 @@ def compile(
         "tiling_optimization_level": tiling_optimization_level,
         "l2_limit_for_tiling": l2_limit_for_tiling,
         "offload_module_to_cpu": offload_module_to_cpu,
+        "use_distributed_mode_trace": use_distributed_mode_trace,
     }

     settings = CompilationSettings(**compilation_options)
@@ -1029,6 +1038,7 @@ def convert_exported_program_to_serialized_trt_engine(
     tiling_optimization_level: str = _defaults.TILING_OPTIMIZATION_LEVEL,
     l2_limit_for_tiling: int = _defaults.L2_LIMIT_FOR_TILING,
     offload_module_to_cpu: bool = _defaults.OFFLOAD_MODULE_TO_CPU,
+    use_distributed_mode_trace: bool = _defaults.USE_DISTRIBUTED_MODE_TRACE,
     **kwargs: Any,
 ) -> bytes:
     """Convert an ExportedProgram to a serialized TensorRT engine
@@ -1093,6 +1103,7 @@ def convert_exported_program_to_serialized_trt_engine(
         enable_weight_streaming (bool): Enable weight streaming.
         tiling_optimization_level (str): The optimization level of tiling strategies. A higher level allows TensorRT to spend more time searching for better tiling strategy. We currently support ["none", "fast", "moderate", "full"].
         l2_limit_for_tiling (int): The target L2 cache usage limit (in bytes) for tiling optimization (default is -1 which means no limit).
+        use_distributed_mode_trace (bool): Whether to use aot_autograd to trace the graph. Enable this when DTensors or distributed tensors are present in the model.
     Returns:
         bytes: Serialized TensorRT engine, can either be saved to a file or deserialized via TensorRT APIs
     """
@@ -1215,6 +1226,7 @@ def convert_exported_program_to_serialized_trt_engine(
         "tiling_optimization_level": tiling_optimization_level,
         "l2_limit_for_tiling": l2_limit_for_tiling,
         "offload_module_to_cpu": offload_module_to_cpu,
+        "use_distributed_mode_trace": use_distributed_mode_trace,
     }

     settings = CompilationSettings(**compilation_options)
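
For context, a minimal usage sketch of the new setting (not part of this commit): the flag is passed as a keyword argument to dynamo.compile and flows into CompilationSettings exactly as in the hunks above. The module and input shape here are hypothetical.

import torch
import torch_tensorrt

# Hypothetical module; any model containing DTensors / distributed tensors would apply.
model = MyDistributedModule().eval().cuda()
inputs = [torch.randn(1, 3, 224, 224, device="cuda")]

exp_program = torch.export.export(model, tuple(inputs))
trt_gm = torch_tensorrt.dynamo.compile(
    exp_program,
    inputs=inputs,
    # New in this commit: trace the graph with aot_autograd for distributed models.
    use_distributed_mode_trace=True,
)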

py/torch_tensorrt/dynamo/conversion/converter_utils.py

Lines changed: 2 additions & 65 deletions
@@ -1,8 +1,6 @@
 import collections
-import ctypes
 import functools
 import logging
-import os
 from typing import (
     Any,
     Callable,
@@ -24,6 +22,7 @@
 from torch.fx.node import Argument, Target
 from torch.fx.passes.shape_prop import TensorMetadata
 from torch_tensorrt import _enums
+from torch_tensorrt._enums import Platform
 from torch_tensorrt.dynamo._settings import CompilationSettings
 from torch_tensorrt.dynamo._SourceIR import SourceIR
 from torch_tensorrt.dynamo.conversion._ConversionContext import ConversionContext
@@ -1116,69 +1115,6 @@ def args_bounds_check(
     return args[i] if len(args) > i and args[i] is not None else replacement


-def load_tensorrt_llm() -> bool:
-    """
-    Attempts to load the TensorRT-LLM plugin and initialize it.
-
-    Returns:
-        bool: True if the plugin was successfully loaded and initialized, False otherwise.
-    """
-    try:
-        import tensorrt_llm as trt_llm  # noqa: F401
-
-        _LOGGER.info("TensorRT-LLM successfully imported")
-        return True
-    except (ImportError, AssertionError) as e_import_error:
-        # Check for environment variable for the plugin library path
-        plugin_lib_path = os.environ.get("TRTLLM_PLUGINS_PATH")
-        if not plugin_lib_path:
-            _LOGGER.warning(
-                "TensorRT-LLM is not installed. Please install TensorRT-LLM or set TRTLLM_PLUGINS_PATH to the directory containing libnvinfer_plugin_tensorrt_llm.so to use converters for torch.distributed ops",
-            )
-            return False
-
-        _LOGGER.info(f"TensorRT-LLM Plugin lib path found: {plugin_lib_path}")
-        try:
-            # Load the shared library
-            handle = ctypes.CDLL(plugin_lib_path)
-            _LOGGER.info(f"Successfully loaded plugin library: {plugin_lib_path}")
-        except OSError as e_os_error:
-            _LOGGER.error(
-                f"Failed to load libnvinfer_plugin_tensorrt_llm.so from {plugin_lib_path}"
-                f"Ensure the path is correct and the library is compatible",
-                exc_info=e_os_error,
-            )
-            return False
-
-        try:
-            # Configure plugin initialization arguments
-            handle.initTrtLlmPlugins.argtypes = [ctypes.c_void_p, ctypes.c_char_p]
-            handle.initTrtLlmPlugins.restype = ctypes.c_bool
-        except AttributeError as e_plugin_unavailable:
-            _LOGGER.warning(
-                "Unable to initialize the TensorRT-LLM plugin library",
-                exc_info=e_plugin_unavailable,
-            )
-            return False
-
-        try:
-            # Initialize the plugin
-            TRT_LLM_PLUGIN_NAMESPACE = "tensorrt_llm"
-            if handle.initTrtLlmPlugins(None, TRT_LLM_PLUGIN_NAMESPACE.encode("utf-8")):
-                _LOGGER.info("TensorRT-LLM plugin successfully initialized")
-                return True
-            else:
-                _LOGGER.warning("TensorRT-LLM plugin library failed in initialization")
-                return False
-        except Exception as e_initialization_error:
-            _LOGGER.warning(
-                "Exception occurred during TensorRT-LLM plugin library initialization",
-                exc_info=e_initialization_error,
-            )
-            return False
-    return False
-
-
 def promote_trt_tensors_to_same_dtype(
     ctx: ConversionContext, lhs: TRTTensor, rhs: TRTTensor, name_prefix: str
 ) -> tuple[TRTTensor, TRTTensor]:
@@ -1216,3 +1152,4 @@ def promote_trt_tensors_to_same_dtype(
     rhs_cast = cast_trt_tensor(ctx, rhs, promoted_dtype, f"{name_prefix}rhs_cast")

     return lhs_cast, rhs_cast
+
py/torch_tensorrt/dynamo/conversion/custom_ops_converters.py

Lines changed: 1 addition & 1 deletion
@@ -11,11 +11,11 @@
 from torch_tensorrt.dynamo.conversion._ConverterRegistry import (
     dynamo_tensorrt_converter,
 )
-from torch_tensorrt.dynamo.conversion.converter_utils import load_tensorrt_llm
 from torch_tensorrt.dynamo.lowering.passes.fuse_distributed_ops import (
     tensorrt_fused_nccl_all_gather_op,
     tensorrt_fused_nccl_reduce_scatter_op,
 )
+from torch_tensorrt.dynamo.utils import load_tensorrt_llm

 _LOGGER: logging.Logger = logging.getLogger(__name__)
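
As a rough sketch (assumed from the imports above; the registration code itself is not part of this diff), the moved helper is typically used to gate converter registration on plugin availability:

if load_tensorrt_llm():

    @dynamo_tensorrt_converter(tensorrt_fused_nccl_all_gather_op)
    def fused_nccl_gather(ctx, target, args, kwargs, name):
        # Body omitted here; it would lower the fused all_gather op to the
        # TensorRT-LLM NCCL plugin layer.
        ...

else:
    _LOGGER.debug("TensorRT-LLM plugin not available, skipping NCCL converter registration")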

py/torch_tensorrt/dynamo/utils.py

Lines changed: 129 additions & 1 deletion
@@ -1,7 +1,10 @@
 from __future__ import annotations

+import ctypes
 import gc
 import logging
+import os
+import urllib.request
 import warnings
 from dataclasses import fields, replace
 from enum import Enum
@@ -14,9 +17,10 @@
 from torch._subclasses.fake_tensor import FakeTensor
 from torch.fx.experimental.proxy_tensor import unset_fake_temporarily
 from torch_tensorrt._Device import Device
-from torch_tensorrt._enums import dtype
+from torch_tensorrt._enums import Platform, dtype
 from torch_tensorrt._features import ENABLED_FEATURES
 from torch_tensorrt._Input import Input
+from torch_tensorrt._version import __tensorrt_llm_version__
 from torch_tensorrt.dynamo import _defaults
 from torch_tensorrt.dynamo._defaults import default_device
 from torch_tensorrt.dynamo._engine_cache import BaseEngineCache
@@ -820,3 +824,127 @@ def is_tegra_platform() -> bool:
     if torch.cuda.get_device_capability() in [(8, 7), (7, 2)]:
         return True
     return False
+
+
+def download_plugin_lib_path(py_version: str, platform: str) -> str:
+    plugin_lib_path = None
+
+    # Downloading TRT-LLM lib
+    base_url = "https://pypi.nvidia.com/tensorrt-llm/"
+    file_name = f"tensorrt_llm-{__tensorrt_llm_version__}-{py_version}-{py_version}-{platform}.whl"
+    download_url = base_url + file_name
+    if not (os.path.exists(file_name)):
+        try:
+            logger.debug(f"Downloading {download_url} ...")
+            urllib.request.urlretrieve(download_url, file_name)
+            logger.debug("Download succeeded and TRT-LLM wheel is now present")
+        except urllib.error.HTTPError as e:
+            logger.error(
+                f"HTTP error {e.code} when trying to download {download_url}: {e.reason}"
+            )
+        except urllib.error.URLError as e:
+            logger.error(
+                f"URL error when trying to download {download_url}: {e.reason}"
+            )
+        except OSError as e:
+            logger.error(f"Local file write error: {e}")
+
+    # Proceeding with the unzip of the wheel file
+    # This will exist if the filename was already downloaded
+    if "linux" in platform:
+        lib_filename = "libnvinfer_plugin_tensorrt_llm.so"
+    else:
+        lib_filename = "libnvinfer_plugin_tensorrt_llm.dll"
+    plugin_lib_path = os.path.join("./tensorrt_llm/libs", lib_filename)
+    if os.path.exists(plugin_lib_path):
+        return plugin_lib_path
+    try:
+        import zipfile
+    except ImportError as e:
+        raise ImportError(
+            "zipfile module is required but not found. Please install zipfile"
+        )
+    with zipfile.ZipFile(file_name, "r") as zip_ref:
+        zip_ref.extractall(".")  # Extract to a folder named 'tensorrt_llm'
+    plugin_lib_path = "./tensorrt_llm/libs/" + lib_filename
+    return plugin_lib_path
+
+
+def load_tensorrt_llm() -> bool:
+    """
+    Attempts to load the TensorRT-LLM plugin and initialize it.
+    Either the env variable TRTLLM_PLUGINS_PATH can specify the path
+    Or the user can specify USE_TRTLLM_PLUGINS as either of (1, true, yes, on) to download the TRT-LLM distribution and load it
+
+    Returns:
+        bool: True if the plugin was successfully loaded and initialized, False otherwise.
+    """
+    plugin_lib_path = os.environ.get("TRTLLM_PLUGINS_PATH")
+    if not plugin_lib_path:
+        # this option can be used by user if TRTLLM_PLUGINS_PATH is not set by user
+        use_trtllm_plugin = os.environ.get("USE_TRTLLM_PLUGINS", "0").lower() in (
+            "1",
+            "true",
+            "yes",
+            "on",
+        )
+        if not use_trtllm_plugin:
+            logger.warning(
+                "Neither TRTLLM_PLUGIN_PATH is set nor is it directed to download the shared library. Please set either of the two to use TRT-LLM libraries in torchTRT"
+            )
+            return False
+        else:
+            # this is used as the default py version
+            py_version = "cp310"
+            platform = Platform.current_platform()
+
+            platform = str(platform).lower()
+            plugin_lib_path = download_plugin_lib_path(py_version, platform)
+
+    try:
+        # Load the shared TRT-LLM file
+        handle = ctypes.CDLL(plugin_lib_path)
+        logger.info(f"Successfully loaded plugin library: {plugin_lib_path}")
+    except OSError as e_os_error:
+        if "libmpi" in str(e_os_error):
+            logger.warning(
+                f"Failed to load libnvinfer_plugin_tensorrt_llm.so from {plugin_lib_path}. "
+                f"The dependency libmpi.so is missing. "
+                f"Please install the packages libmpich-dev and libopenmpi-dev.",
+                exc_info=e_os_error,
+            )
+        else:
+            logger.warning(
+                f"Failed to load libnvinfer_plugin_tensorrt_llm.so from {plugin_lib_path}"
+                f"Ensure the path is correct and the library is compatible",
+                exc_info=e_os_error,
+            )
+        return False
+
+    try:
+        # Configure plugin initialization arguments
+        handle.initTrtLlmPlugins.argtypes = [ctypes.c_void_p, ctypes.c_char_p]
+        handle.initTrtLlmPlugins.restype = ctypes.c_bool
+    except AttributeError as e_plugin_unavailable:
+        logger.warning(
+            "Unable to initialize the TensorRT-LLM plugin library",
+            exc_info=e_plugin_unavailable,
+        )
+        return False
+
+    try:
+        # Initialize the plugin
+        TRT_LLM_PLUGIN_NAMESPACE = "tensorrt_llm"
+        if handle.initTrtLlmPlugins(None, TRT_LLM_PLUGIN_NAMESPACE.encode("utf-8")):
+            logger.info("TensorRT-LLM plugin successfully initialized")
+            return True
+        else:
+            logger.warning("TensorRT-LLM plugin library failed in initialization")
+            return False
+    except Exception as e_initialization_error:
+        logger.warning(
+            "Exception occurred during TensorRT-LLM plugin library initialization",
+            exc_info=e_initialization_error,
+        )
+        return False
+    return False
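
A minimal usage sketch of the two environment variables introduced here (the plugin path below is hypothetical; only the variable names and the load_tensorrt_llm entry point come from this diff):

import os

from torch_tensorrt.dynamo.utils import load_tensorrt_llm

# Option 1: point directly at an existing plugin library (hypothetical path).
os.environ["TRTLLM_PLUGINS_PATH"] = "/opt/trtllm/libnvinfer_plugin_tensorrt_llm.so"

# Option 2: let torch-tensorrt download the TRT-LLM wheel and extract the plugin.
# os.environ["USE_TRTLLM_PLUGINS"] = "1"

if load_tensorrt_llm():
    print("TensorRT-LLM plugin loaded; torch.distributed converters can be registered")
else:
    print("Plugin unavailable; distributed ops will not be converted to TensorRT")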

setup.py

Lines changed: 4 additions & 0 deletions
@@ -28,6 +28,7 @@
 __version__: str = "0.0.0"
 __cuda_version__: str = "0.0"
 __tensorrt_version__: str = "0.0"
+__tensorrt_llm_version__: str = "0.0"

 LEGACY_BASE_VERSION_SUFFIX_PATTERN = re.compile("a0$")

@@ -63,6 +64,7 @@ def get_base_version() -> str:
 def load_dep_info():
     global __cuda_version__
     global __tensorrt_version__
+    global __tensorrt_llm_version__
     with open("dev_dep_versions.yml", "r") as stream:
         versions = yaml.safe_load(stream)
         if (gpu_arch_version := os.environ.get("CU_VERSION")) is not None:
@@ -72,6 +74,7 @@ def load_dep_info():
         else:
             __cuda_version__ = versions["__cuda_version__"]
         __tensorrt_version__ = versions["__tensorrt_version__"]
+        __tensorrt_llm_version__ = versions["__tensorrt_llm_version__"]


 load_dep_info()
@@ -240,6 +243,7 @@ def gen_version_file():
         f.write('__version__ = "' + __version__ + '"\n')
         f.write('__cuda_version__ = "' + __cuda_version__ + '"\n')
         f.write('__tensorrt_version__ = "' + __tensorrt_version__ + '"\n')
+        f.write('__tensorrt_llm_version__ = "' + __tensorrt_llm_version__ + '"\n')


 def copy_libtorchtrt(multilinux=False, rt_only=False):
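
To illustrate the effect of these setup.py changes, the generated torch_tensorrt/_version.py gains one line. A sketch of the relevant excerpt, with dependency values taken from dev_dep_versions.yml above (the package version itself is resolved elsewhere and shown only as a placeholder):

# torch_tensorrt/_version.py (excerpt, generated by gen_version_file())
__version__ = "0.0.0"  # placeholder; the real value comes from the versioning logic in setup.py
__cuda_version__ = "12.8"
__tensorrt_version__ = "10.11.0"
__tensorrt_llm_version__ = "0.17.0.post1"  # new line added by this commit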
