Changes from all commits (19 commits)
81c6507 - Improve documentation after trying on a new machine. (SandSnip3r, Jan 8, 2026)
299b095 - Clarified instructions. (SandSnip3r, Jan 8, 2026)
3ddb261 - Merge remote-tracking branch 'upstream/main' (SandSnip3r, Jan 13, 2026)
90d6cd2 - Implement rms norm converter. (SandSnip3r, Jan 15, 2026)
80d63a2 - Add test for rms norm converter. (SandSnip3r, Jan 15, 2026)
723f5fb - Merge branch 'main' into rms_norm_converter (SandSnip3r, Jan 15, 2026)
29e4e00 - Merge branch 'pytorch:main' into rms_norm_converter (SandSnip3r, Jan 21, 2026)
c122176 - Fix the converter issue caused by this missing unset_fake_temporarily… (wenbingl, Jan 22, 2026)
4da0cf0 - chore: update uv.lock (Jan 26, 2026)
183f7f0 - lowering pass: fully remove SymInt (#4001) (zewenli98, Jan 26, 2026)
824a770 - fix the layer info test failure and deal with potential segfault (#4042) (narendasan, Jan 27, 2026)
240ce8e - cherry pick 4033: skip llm test if modelopt is not installed from rel… (lanluo-nvidia, Jan 27, 2026)
f522436 - cherry pick 4038 from ngc release branch to main: skip failed test on… (lanluo-nvidia, Jan 27, 2026)
84b3f8f - cherry pick 4028: fix resource partitioner issue from release branch … (lanluo-nvidia, Jan 27, 2026)
19e20e6 - cherry pick 4029: upgrade aarch64 base image from release branch to m… (lanluo-nvidia, Jan 27, 2026)
e6415e3 - fix: torchtrtc precision setting logic (#3883) (yeetypete, Jan 29, 2026)
61cbf2e - Empty tensor handling (#3891) (apbose, Jan 30, 2026)
5edae8b - chore: update uv.lock (Feb 9, 2026)
aec7e45 - fix: example argument issue raised in 4070 (#4071) (zewenli98, Feb 9, 2026)
2 changes: 1 addition & 1 deletion .github/scripts/generate-release-matrix.py
@@ -12,7 +12,7 @@
"wheel": ["3.10", "3.11", "3.12", "3.13"],
"tarball": ["3.11"],
}
sbsa_container_image: str = "quay.io/pypa/manylinux_2_34_aarch64"
sbsa_container_image: str = "quay.io/pypa/manylinux_2_39_aarch64"

CXX11_TARBALL_CONTAINER_IMAGE = {
"cu130": "pytorch/libtorch-cxx11-builder:cuda13.0-main",
2 changes: 1 addition & 1 deletion .github/workflows/build_linux.yml
@@ -413,5 +413,5 @@ jobs:
PYPI_API_TOKEN: ${{ secrets.PYPI_API_TOKEN }}

concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{inputs.is-release-wheel}}-${{inputs.is-release-tarball}}-${{inputs.use-rtx}}-${{inputs.architecture}}-${{inputs.is-jetpack}}-${{ inputs.repository }}-${{ github.event_name == 'workflow_dispatch' }}
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{inputs.is-release-wheel}}-${{inputs.is-release-tarball}}-${{inputs.use-rtx}}-${{inputs.architecture}}-${{inputs.is-jetpack}}-${{ inputs.repository }}-${{ github.event_name == 'workflow_dispatch' }}-${{ startsWith(github.ref, 'refs/tags/') && github.ref_name || 'no-tag' }}
cancel-in-progress: true
2 changes: 1 addition & 1 deletion .github/workflows/build_windows.yml
@@ -438,5 +438,5 @@ jobs:
architecture: ${{ inputs.architecture }}

concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ inputs.repository }}-${{ inputs.is-release-wheel }}-${{ inputs.is-release-tarball }}-${{ github.event_name == 'workflow_dispatch' }}
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ inputs.repository }}-${{ inputs.is-release-wheel }}-${{ inputs.is-release-tarball }}-${{ github.event_name == 'workflow_dispatch' }}-${{ startsWith(github.ref, 'refs/tags/') && github.ref_name || 'no-tag' }}
cancel-in-progress: true
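Presumably, appending the tag ref name to these concurrency group keys gives tag-triggered release builds their own group, so a release tag push and an in-flight branch or PR run no longer cancel each other.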
3 changes: 2 additions & 1 deletion .github/workflows/release-linux-aarch64.yml
@@ -1,6 +1,7 @@
name: Release aarch64 Linux wheels and tarball artifacts

on:
pull_request:
push:
tags:
# NOTE: Binary build pipelines should only get triggered on release candidate builds
@@ -128,5 +129,5 @@ jobs:
architecture: "aarch64"

concurrency:
group: ${{ github.workflow }}-aarch64-${{ github.event.pull_request.number || github.ref_name }}-${{ inputs.repository }}-${{ github.event_name == 'workflow_dispatch' }}-${{ inputs.job-name }}
group: ${{ github.workflow }}-aarch64-release-${{ github.event.pull_request.number || github.ref_name }}-${{ inputs.repository }}-${{ github.event_name == 'workflow_dispatch' }}-${{ inputs.job-name }}
cancel-in-progress: true
2 changes: 1 addition & 1 deletion .github/workflows/release-linux-x86_64.yml
@@ -126,5 +126,5 @@ jobs:
is-release-wheel: true

concurrency:
group: ${{ github.workflow }}-x86_64-${{ github.event.pull_request.number || github.ref_name }}-${{ inputs.repository }}-${{ github.event_name == 'workflow_dispatch' }}-${{ inputs.job-name }}
group: ${{ github.workflow }}-x86_64-release-${{ github.event.pull_request.number || github.ref_name }}-${{ inputs.repository }}-${{ github.event_name == 'workflow_dispatch' }}-${{ inputs.job-name }}
cancel-in-progress: true
8 changes: 7 additions & 1 deletion core/runtime/TRTEngine.cpp
@@ -142,6 +142,9 @@ TRTEngine::TRTEngine(
}
TORCHTRT_CHECK((exec_ctx.get() != nullptr), "Unable to create TensorRT execution context");

// Pre-allocate placeholder for empty tensors (TensorRT requires non-null addresses)
cudaMalloc(&empty_tensor_placeholder, 1);

runtime_states.old_cudagraphs = CUDAGRAPHS_MODE;
runtime_states.old_pre_allocated_outputs = false;
runtime_states.context_changed = false;
@@ -264,6 +267,9 @@ TRTEngine::~TRTEngine() {
trt_engine_profiler.reset();
exec_ctx.reset();
cuda_engine.reset();
if (empty_tensor_placeholder) {
cudaFree(empty_tensor_placeholder);
}
rt.reset();
}

@@ -315,7 +321,7 @@ void TRTEngine::set_profile_format(std::string format) {
}

std::string TRTEngine::get_engine_layer_info() {
auto inspector = cuda_engine->createEngineInspector();
auto inspector = make_trt(cuda_engine->createEngineInspector());
return inspector->getEngineInformation(nvinfer1::LayerInformationFormat::kJSON);
}

3 changes: 3 additions & 0 deletions core/runtime/TRTEngine.h
@@ -187,6 +187,9 @@ struct TRTEngine : torch::CustomClassHolder {
bool use_pre_allocated_outputs = false;
std::vector<at::Tensor> pre_allocated_outputs;

// Single placeholder buffer for empty tensor inputs (allocated once, reused)
void* empty_tensor_placeholder = nullptr;

// Output Allocator-Related Functionality
bool requires_output_allocator = false; // engine requires output allocator
bool use_output_allocator_outputs = false; // users specify to use output allocator
20 changes: 14 additions & 6 deletions core/runtime/execute_engine.cpp
@@ -149,18 +149,26 @@ void setup_input_tensors(
TORCHTRT_CHECK(
compiled_engine->exec_ctx->setInputShape(name.c_str(), dims), "Error while setting the input shape");

at::Tensor final_input;
if (cudagraphs_enabled) {
// If using CUDAGraphs copy formatted input to the corresponding persistent input buffer
compiled_engine->input_buffers[i].copy_(formatted_inputs.back(), true);
TORCHTRT_CHECK(
compiled_engine->exec_ctx->setTensorAddress(name.c_str(), compiled_engine->input_buffers[i].data_ptr()),
"Error while setting the input tensor address for inputs");
final_input = compiled_engine->input_buffers[i];
} else {
// Otherwise use the formatted buffer directly
TORCHTRT_CHECK(
compiled_engine->exec_ctx->setTensorAddress(name.c_str(), formatted_inputs.back().data_ptr()),
"Error while setting the input tensor address for inputs");
final_input = formatted_inputs.back();
}

// Get tensor address, using placeholder for empty tensors
// TensorRT requires non-null address even if numel() = 0
// empty_tensor_placeholder is pre-allocated in TRTEngine constructor
void* input_addr = (final_input.numel() == 0 || final_input.data_ptr() == nullptr)
? compiled_engine->empty_tensor_placeholder
: final_input.data_ptr();

TORCHTRT_CHECK(
compiled_engine->exec_ctx->setTensorAddress(name.c_str(), input_addr),
"Failed to bind tensor address for " << name);
}
}
}
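A minimal sketch (illustration only, not part of the patch) of the condition the placeholder guards against: a zero-element CUDA tensor typically reports a null data pointer, which TensorRT rejects as a tensor address, so the runtime binds the pre-allocated 1-byte buffer instead. Assumes a CUDA-capable machine.

import torch

# An empty input like this would previously hand TensorRT a null address.
x = torch.empty((0, 768), device="cuda")
print(x.numel())     # 0 -> takes the placeholder branch in setup_input_tensors
print(x.data_ptr())  # 0 (null) for a zero-element CUDA tensor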
1 change: 1 addition & 0 deletions cpp/bin/torchtrtc/main.cpp
@@ -356,6 +356,7 @@ int main(int argc, char** argv) {
}

if (enabled_precisions) {
compile_settings.enabled_precisions.clear();
for (const auto& precision : args::get(enabled_precisions)) {
auto dtype = torchtrtc::parserutil::parse_dtype(precision);
if (dtype == torchtrt::DataType::kFloat) {
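A hedged Python sketch of the set semantics behind this one-line fix (the values are hypothetical; torchtrtc's actual defaults live in the C++ settings struct): enabled_precisions starts out with a default member, so inserting the user's requested precisions without clearing first yields the union rather than a replacement.

# Hypothetical stand-ins for the C++ precision set.
defaults = {"fp32"}                # default enabled precision
requested = {"fp16"}               # precisions parsed from the CLI arguments

before_fix = defaults | requested  # {"fp32", "fp16"}: the default leaks through
after_fix = set(requested)         # clear() first, then insert: {"fp16"}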
58 changes: 58 additions & 0 deletions examples/dynamo/compile_with_dynamic_inputs.py
@@ -0,0 +1,58 @@
import logging

import torch
import torch.nn as nn
import torch_tensorrt

logging.basicConfig(level=logging.DEBUG)

torch.manual_seed(0)


class ExpandReshapeModel(nn.Module):
def __init__(self, embed_dim: int):
super().__init__()
self.cls_token = nn.Parameter(torch.randn(1, 1, embed_dim))
self.embed_dim = embed_dim
self.qkv_proj = nn.Linear(self.embed_dim, self.embed_dim * 3)

def forward(self, x: torch.Tensor):
batch_size = x.shape[0]
cls_token = self.cls_token.expand(batch_size, -1, -1)
x = torch.cat([cls_token, x], dim=1)
x = self.qkv_proj(x)
reshaped_qkv = x.reshape(batch_size, x.size(1), 3, 12, -1)
return reshaped_qkv


model = ExpandReshapeModel(embed_dim=768).cuda().eval()
x = torch.randn(4, 196, 768).cuda()

# 1. JIT: torch.compile
x1 = x.clone()
torch._dynamo.mark_dynamic(x1, index=0, min=2, max=32)
trt_module = torch.compile(model, backend="tensorrt")
out1 = trt_module(x1)

# 2. AOT: torch_tensorrt.compile
x2 = x.clone()
example_input = torch_tensorrt.Input(
min_shape=[1, 196, 768],
opt_shape=[4, 196, 768],
max_shape=[32, 196, 768],
dtype=torch.float32,
)
trt_module = torch_tensorrt.compile(model, ir="dynamo", inputs=example_input)
out2 = trt_module(x2)

# 3. AOT: torch.export + Dynamo compile
x3 = x.clone()
bs = torch.export.Dim("bs", min=1, max=32)
dynamic_shapes = {"x": {0: bs}}
exp_program = torch.export.export(model, (x3,), dynamic_shapes=dynamic_shapes)
trt_module = torch_tensorrt.dynamo.compile(exp_program, (x3,))
out3 = trt_module(x3)

assert torch.allclose(out1, out2)
assert torch.allclose(out1, out3)
assert torch.allclose(out2, out3)
3 changes: 3 additions & 0 deletions examples/dynamo/torch_compile_resnet_example.py
@@ -48,6 +48,7 @@
model,
ir="torch_compile",
inputs=inputs,
use_explicit_typing=False,
enabled_precisions=enabled_precisions,
workspace_size=workspace_size,
min_block_size=min_block_size,
@@ -86,6 +87,7 @@
model,
ir="torch_compile",
inputs=inputs_bs8,
use_explicit_typing=False,
enabled_precisions=enabled_precisions,
workspace_size=workspace_size,
min_block_size=min_block_size,
@@ -111,6 +113,7 @@
dtype=torch.half,
)
],
"use_explicit_typing": False,
"enabled_precisions": enabled_precisions,
"ir": "dynamo",
}
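The added use_explicit_typing=False appears to pin these examples to weakly typed compilation, the mode in which the enabled_precisions set passed alongside it is honored.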
16 changes: 14 additions & 2 deletions py/torch_tensorrt/_Input.py
@@ -1,11 +1,14 @@
from __future__ import annotations

import logging
from enum import Enum
from typing import Any, Dict, List, Optional, Sequence, Tuple

import torch
from torch_tensorrt._enums import dtype, memory_format

logger = logging.getLogger(__name__)


class Input(object):
"""
@@ -149,6 +152,16 @@ def __init__(self, *args: Any, **kwargs: Any) -> None:
}
self.shape_mode = Input._ShapeMode.DYNAMIC

# Warn if min_shape has any 0 dimension (empty tensor) - TensorRT doesn't support this
# @apbose: Is this warning necessary?
if any(dim == 0 for dim in self.shape["min_shape"]):
logger.warning(
f"min_shape contains a 0 dimension: {self.shape['min_shape']}. "
"TensorRT does not support dynamic shapes with min dimension of 0 (empty tensors). "
"TensorRT will internally clamp min dimensions to 1, which may cause runtime errors "
"if you try to run inference with empty tensor inputs."
)

else:
raise ValueError(
f"Unexpected number of positional arguments for class Input \n Found {len(args)} arguments, expected either zero or a single positional arguments"
@@ -384,7 +397,7 @@ def example_tensor(
dtype=self.dtype.to(torch.dtype, use_default=True)
)
else:
RuntimeError(
raise RuntimeError(
f"Input shape is dynamic but shapes are not provided as sequence (found: {self.shape})"
)
else:
@@ -412,4 +425,3 @@ def example_tensor(
raise ValueError(
"Requested an example tensor from a dynamic shaped input but did not specific which profile field to use."
)
raise
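A short sketch (assuming a working torch_tensorrt install) of an Input spec that would trip the new warning: the 0 in min_shape's batch dimension is exactly what TensorRT clamps to 1.

import torch
import torch_tensorrt

inp = torch_tensorrt.Input(
    min_shape=[0, 3, 224, 224],  # the 0 here triggers the warning above
    opt_shape=[4, 3, 224, 224],
    max_shape=[8, 3, 224, 224],
    dtype=torch.float32,
)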
6 changes: 6 additions & 0 deletions py/torch_tensorrt/_utils.py
@@ -76,6 +76,12 @@ def is_tegra_platform() -> bool:
return False


def is_orin() -> bool:
if torch.cuda.get_device_capability() in [(8, 7)]:
return True
return False


def is_thor() -> bool:
if torch.cuda.get_device_capability() in [(11, 0)]:
return True
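For reference, a tiny sketch of what these platform helpers key on: torch reports the active GPU's compute capability as a (major, minor) tuple, and per the checks above (8, 7) corresponds to Orin and (11, 0) to Thor.

import torch

major, minor = torch.cuda.get_device_capability()
print(f"sm_{major}{minor}")  # e.g. "sm_87" on an Orin-class device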
23 changes: 9 additions & 14 deletions py/torch_tensorrt/dynamo/conversion/_ConverterRegistry.py
@@ -20,7 +20,6 @@

import tensorrt as trt
import torch
import torch_tensorrt
from torch import SymBool, SymFloat, SymInt
from torch._ops import OpOverloadPacket
from torch.fx.node import Argument, Node, Target, _get_qualified_name
@@ -536,7 +535,7 @@ def __contains__(self, key: Target | Node) -> bool:
def get_all_converters_with_target(
self, key: Target, return_registry_info: bool = False
) -> Tuple[
Union[List[Any], Dict[str, int], None]
List[Any], Optional[Dict[str, int]]
]: # TODO: Narrow to ConverterImplSignature this when we can remove FX converters
"""Get all converters across all registries for the target

@@ -547,7 +546,7 @@ def get_all_converters_with_target(

# Store count of number of registered converters per registry
if return_registry_info:
registry_data = {name: 0 for name in self.registry_names}
registry_data = dict.fromkeys(self.registry_names, 0)

for index, registry in enumerate(self.registries):
if key in registry:
@@ -622,22 +621,18 @@ def display_all_available_converters(self) -> str:
return available_converters


# Initialize dynamo converter registry with the FX and Dynamo aten registries
# Note the Dynamo registry is listed first, for precedence
registries = [
DYNAMO_ATEN_CONVERTERS,
# Initialize dynamo converter registry with Dynamo aten converters only
# FX converters are not loaded here - they are legacy and should only be used
# in the FX frontend, not as fallbacks in the dynamo frontend
registries: List[
Dict[Target, Union[Callable[..., Any], Sequence[ConverterSupport]]]
] = [
DYNAMO_ATEN_CONVERTERS, # type: ignore[list-item]
]
registry_names = ["Dynamo ATen Converters Registry"]
registry_calling_conventions = [
CallingConvention.CTX,
]
if torch_tensorrt.ENABLED_FEATURES.fx_frontend:
from torch_tensorrt.fx.converter_registry import CONVERTERS as FX_CONVERTERS

registries.append(FX_CONVERTERS)
registry_names.append("FX Legacy ATen Converters Registry")
registry_calling_conventions.append(CallingConvention.LEGACY)


DYNAMO_CONVERTERS: ConverterRegistry = ConverterRegistry(
registries,
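A hedged sketch of a registry lookup after this change (assuming DYNAMO_CONVERTERS is importable from torch_tensorrt.dynamo.conversion, where this module lives): with the FX fallback no longer appended, only the Dynamo ATen registry is consulted and reported.

import torch
from torch_tensorrt.dynamo.conversion import DYNAMO_CONVERTERS

converters, info = DYNAMO_CONVERTERS.get_all_converters_with_target(
    torch.ops.aten.relu.default, return_registry_info=True
)
print(info)  # e.g. {"Dynamo ATen Converters Registry": 1}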