Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 18 additions & 5 deletions csrc/device_lower/analysis/fused_reduction.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,11 @@ class FusionInspector : private IterVisitor {
// TIDx and its size is a multiple of warp size (32).
auto is_static_warp_reduction = [](TensorView* out,
bool has_warp_specialization) {
if (!has_warp_specialization) {
// Check if bdimx is statically known in launch params
bool has_static_bdimx = GpuLower::hasCurrent() &&
GpuLower::current()->launchParams().hasDim(ParallelType::TIDx);

if (!has_warp_specialization && !has_static_bdimx) {
return false;
}

Expand All @@ -97,10 +101,19 @@ class FusionInspector : private IterVisitor {
for (auto ld : out->getLoopDomain()) {
if (ld->isReduction()) {
reduction_count++;
if (ld->getParallelType() == ParallelType::TIDx &&
ld->extent()->isConst() &&
ld->extent()->value().as<int64_t>() % kThreadsPerWarp == 0) {
has_valid_tidx_reduction = true;
if (ld->getParallelType() == ParallelType::TIDx) {
// Get extent either from launch params or from the const extent
std::optional<int64_t> extent;
if (has_static_bdimx) {
extent = GpuLower::current()->launchParams().getDim(
ParallelType::TIDx);
} else if (ld->extent()->isConst()) {
extent = ld->extent()->value().as<int64_t>();
}

if (extent.has_value() && extent.value() % kThreadsPerWarp == 0) {
has_valid_tidx_reduction = true;
}
}
}
}
Expand Down
8 changes: 6 additions & 2 deletions csrc/device_lower/lower2device.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -205,7 +205,10 @@ void dumpExprsIfEnabled(
}
}

GpuLower::GpuLower(Fusion* fusion, const CompileParams& cparams)
GpuLower::GpuLower(
Fusion* fusion,
const CompileParams& cparams,
const LaunchParams& lparams)
: passes_(
// Passes will be executed in the order they are added here
// Each pass is a pair of (name, function), where the name will be
Expand Down Expand Up @@ -234,7 +237,8 @@ GpuLower::GpuLower(Fusion* fusion, const CompileParams& cparams)
{"KIRCleaner", KIRCleaner::cleanUp},
{"instrumentKernel", instrumentKernel},
{"lowerToInlinePtx", lowerToInlinePtx}}),
cparams_(cparams) {
cparams_(cparams),
lparams_(lparams) {
if (isDebugDumpEnabled(DebugDumpOption::FusionIrMath)) {
fusion->printMath();
}
Expand Down
8 changes: 7 additions & 1 deletion csrc/device_lower/lower2device.h
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,8 @@ class GpuLower : public NonCopyable {
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
NVF_API explicit GpuLower(
Fusion* fusion,
const CompileParams& cparams = CompileParams());
const CompileParams& cparams = CompileParams(),
const LaunchParams& lparams = LaunchParams());

NVF_API kir::Kernel* kernel() const;

Expand All @@ -83,6 +84,10 @@ class GpuLower : public NonCopyable {
return cparams_.index_type.value();
}

// Returns the launch parameters supplied at GpuLower construction.
// NOTE(review): may be a default-constructed LaunchParams (all dims
// dynamic) when the caller did not provide one — e.g. the deserialize
// path passes LaunchParams(); callers must check hasDim() before
// treating a dimension as statically known.
const LaunchParams& launchParams() const {
return lparams_;
}

const auto& minDeviceVersion() const {
return min_device_version_;
}
Expand Down Expand Up @@ -391,6 +396,7 @@ class GpuLower : public NonCopyable {
kir::KernelPerformanceProfile profile_;
std::unordered_set<Split*> divisible_splits_;
CompileParams cparams_;
LaunchParams lparams_;
std::unique_ptr<TensorIndexer> tensor_indexer_;
std::unordered_map<TensorView*, const TMAInfo> consumer_to_tma_info_;
std::pair<int64_t, int64_t> dec_inc_register_usage = {-1, -1};
Expand Down
13 changes: 9 additions & 4 deletions csrc/parallel_dimension_map.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -159,10 +159,15 @@ int64_t ParallelDimensionMap::getThreadCountInDim(ParallelType pt) {
if (dim_map_.at(pt)->isConstScalar()) {
return dim_map_.at(pt)->value().as<int64_t>();
}
// Return -1 for dynamic dimensions, this disables register sharing on
// dynamic dimensions since we can't guarantee the number of threads is
// divisible by 128. We may allow this in the future and delegate this
// check to a point where the launch parameters are known.
// If dimension is dynamic but we have launch parameters available,
// use the actual launch parameter value
if (GpuLower::hasCurrent() &&
GpuLower::current()->launchParams().hasDim(pt)) {
return GpuLower::current()->launchParams().getDim(pt);
}
// Return -1 for dynamic dimensions when launch parameters are not known,
// this disables register sharing on dynamic dimensions since we can't
// guarantee the number of threads is divisible by 128.
return -1;
}

Expand Down
6 changes: 5 additions & 1 deletion csrc/runtime/compiled_kernel.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1225,6 +1225,7 @@ void queryTargetGPUVersion(
NVF_API CompiledKernel::CompiledKernel(
Fusion* fusion,
CompileParams compile_params,
const LaunchParams& launch_params,
c10::Device device,
SchedulerType scheduler_type,
int64_t fusion_id,
Expand All @@ -1241,7 +1242,8 @@ NVF_API CompiledKernel::CompiledKernel(
runtime_id,
group_id),
compile_params_(compile_params),
lowered_(std::make_unique<GpuLower>(fusion, compile_params)) {
lowered_(
std::make_unique<GpuLower>(fusion, compile_params, launch_params)) {
FUSER_PERF_SCOPE("CompiledKernel::CompiledKernel");

// TODO: No hooks can be sent because this is in the constructor
Expand Down Expand Up @@ -1270,6 +1272,7 @@ NVF_API CompiledKernel::CompiledKernel(
NVF_API CompiledKernel::CompiledKernel(
Fusion* fusion,
CompileParams compile_params,
const LaunchParams& launch_params,
c10::Device device,
SchedulerType scheduler_type,
int64_t fusion_id,
Expand All @@ -1279,6 +1282,7 @@ NVF_API CompiledKernel::CompiledKernel(
: CompiledKernel(
fusion,
compile_params,
launch_params,
device,
scheduler_type,
fusion_id,
Expand Down
2 changes: 2 additions & 0 deletions csrc/runtime/compiled_kernel.h
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,7 @@ class CompiledKernel : public CompiledKernelBase {
NVF_API CompiledKernel(
Fusion* fusion,
CompileParams compile_params,
const LaunchParams& launch_params,
c10::Device device,
SchedulerType scheduler_type,
int64_t fusion_id,
Expand All @@ -199,6 +200,7 @@ class CompiledKernel : public CompiledKernelBase {
NVF_API CompiledKernel(
Fusion* fusion,
CompileParams compile_params,
const LaunchParams& launch_params,
c10::Device device,
SchedulerType scheduler_type = SchedulerType::None,
int64_t fusion_id = 0,
Expand Down
2 changes: 2 additions & 0 deletions csrc/runtime/executor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -242,6 +242,7 @@ void KernelExecutor::compile(
compiled_kernel_ = std::make_unique<CompiledKernel>(
fusion,
compile_params,
launch_constraints,
device,
scheduler_type,
fusion_id_,
Expand Down Expand Up @@ -1608,6 +1609,7 @@ void KernelExecutor::deserialize(
compiled_kernel_ = std::make_unique<CompiledKernel>(
_fusion,
compile_params,
LaunchParams(),
device,
scheduler_type,
fusion_id,
Expand Down
12 changes: 9 additions & 3 deletions csrc/scheduler/reduction_utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -129,9 +129,15 @@ TensorView* scheduleReductionTV(
// Reduction: [Persistent, TIDx, Vect]
vectorize(inner_reduce_axis, rparams->unroll_factor_inner_reduction);

// static bdimx is required for TMA warp specialization
int64_t compute_bdimx = getComputeBdimx(option, rparams->lparams.bdimx());
inner_parallel_static(inner_reduce_axis, ParallelType::TIDx, compute_bdimx);
reduction_tv->split(
inner_reduce_axis, rparams->batches_per_block_inner_reduction, false);
reduction_tv->axis(inner_reduce_axis + 1)->parallelize(ParallelType::TIDx);
reduction_tv->axis(inner_reduce_axis + 1)->padToMultipleOfWarp();

// static bdimx is required for TMA warp specialization
// int64_t compute_bdimx = getComputeBdimx(option,
// rparams->lparams.bdimx());
// inner_parallel_static(inner_reduce_axis, ParallelType::TIDx, compute_bdimx);
Comment on lines +137 to +140
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Comment formatting is broken - line breaks should be after the comment prefix.

Suggested change
// // static bdimx is required for TMA warp specialization
// int64_t compute_bdimx = getComputeBdimx(option,
// rparams->lparams.bdimx()); inner_parallel_static(inner_reduce_axis,
// ParallelType::TIDx, compute_bdimx);
// static bdimx is required for TMA warp specialization
// int64_t compute_bdimx = getComputeBdimx(option,
// rparams->lparams.bdimx());
// inner_parallel_static(inner_reduce_axis, ParallelType::TIDx, compute_bdimx);

Note: If this suggestion doesn't match your team's coding style, reply to this and let me know. I'll remember it for next time!


// Iteration: [I/Unroll/BIDy, BIDy, Unroll]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Wrong comment prefix - should be // not /.

Suggested change
// Iteration: [I/Unroll/BIDy, BIDy, Unroll]
// Iteration: [I/Unroll/BIDy, BIDy, Unroll]

if (rparams->unroll_factor_iter_dom > 1) {
Expand Down
7 changes: 5 additions & 2 deletions tests/cpp/test_combined_inner_outer_reduction.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1087,8 +1087,11 @@ TEST_P(TmaWarpSpecializedTest, SimpleFusion) {

auto fusion = std::make_unique<Fusion>();
FusionGuard fg(fusion.get());
auto tv0 = makeContigConcreteTensor({dim0, dim1}, dtype);
auto tv1 = makeContigConcreteTensor({dim0, dim1}, dtype);
// For case contig_1_dtype_float_batch_2048_hidden_8192
// the performance is 59.7% SOL using concrete inputs
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Typo: 'uisng' should be 'using'.

Suggested change
// the performance is 59.7% SOL uisng concrete inputs
// the performance is 59.7% SOL using concrete inputs

// for symbolic inputs, the performance is 59.1% SOL
auto tv0 = makeContigTensor(2, dtype);
auto tv1 = makeContigTensor(2, dtype);
fusion->addInput(tv0);
fusion->addInput(tv1);
tv0 = maybeCastOp(DataType::Float, tv0);
Expand Down
91 changes: 86 additions & 5 deletions tests/cpp/test_persistent_buffer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2196,8 +2196,8 @@ TEST_P(TmaPersistentTestP, TmaInnerPersistentRmsNorm) {
const float kEps = 1e-6;
Val* eps_ptr = IrBuilder::create<Val>(kEps);

auto tv0 = makeContigConcreteTensor({x, y}, dtype);
auto tv1 = makeContigConcreteTensor({y}, dtype);
auto tv0 = makeContigTensor(2, dtype);
auto tv1 = makeContigTensor(1, dtype);
fusion.addInput(tv0);
fusion.addInput(tv1);
tv0 = maybeCastOp(DataType::Float, tv0);
Expand Down Expand Up @@ -2271,7 +2271,7 @@ TEST_P(TmaPersistentTestP, TmaInnerPersistentSoftmax) {
auto& fusion = *fusion_ptr;
FusionGuard fg(fusion_ptr.get());

auto tv0 = makeContigTensor(2, dtype);
auto tv0 = makeContigConcreteTensor({x, y}, dtype);
fusion.addInput(tv0);
tv0 = maybeCastOp(DataType::Float, tv0);
auto res = softmax(tv0, 1);
Expand All @@ -2297,8 +2297,8 @@ INSTANTIATE_TEST_SUITE_P(
::testing::Combine(
testing::Values(DataType::BFloat16),
testing::Values(
deviceSMCount() / 2,
1024), // batch size, less or larger than sm count
deviceSMCount() / 2, // small batch, can't do grid stride loop
2048), // batch size, less or larger than sm count
testing::ValuesIn(Pow2Vals1to1Million)), // hidden size
[](const testing::TestParamInfo<TmaPersistentTestParams>& info) {
auto dtype = std::get<0>(info.param);
Expand All @@ -2308,4 +2308,85 @@ INSTANTIATE_TEST_SUITE_P(
os << dtype << "_" << x << "_" << y;
return os.str();
});

// Test that kernels with different launch parameters are not incorrectly reused
// This ensures that LaunchParams is properly included in the cache key
TEST_F(TmaPersistentTestF, KernelReuse) {
auto fusion_ptr = std::make_unique<Fusion>();
auto& fusion = *fusion_ptr;
FusionGuard fg(fusion_ptr.get());

// Create an RMS norm fusion that will use inner persistent scheduler
const float kEps = 1e-6;
Val* eps_ptr = IrBuilder::create<Val>(kEps);

auto tv0 = makeContigTensor(2, DataType::BFloat16);
auto tv1 = makeContigTensor(1, DataType::BFloat16);
fusion.addInput(tv0);
fusion.addInput(tv1);
tv0 = maybeCastOp(DataType::Float, tv0);
tv1 = maybeCastOp(DataType::Float, tv1);
auto rms_norm_results = rms_norm(tv0, 1, tv1, eps_ptr);
auto output = maybeCastOp(DataType::BFloat16, rms_norm_results.output);
fusion.addOutput(output);

FusionExecutorCache executor_cache(std::move(fusion_ptr));

auto options = at::TensorOptions().dtype(at::kBFloat16).device(at::kCUDA, 0);

// Helper to get the number of compiled kernel runtimes
auto numRuntimes = [&executor_cache]() -> size_t {
// this is map<pair<device, conc_info>, vector<FusionKernelRuntime>>
const auto& runtime_map = executor_cache.getKernelRuntimes();
if (runtime_map.empty()) {
return 0;
}
return runtime_map
.begin() // There should be only one device/concretization pair
->second.size();
};

// First run with specific dimensions that will produce launch config A
auto input1 = at::randn({2048, 4096}, options);
auto weight1 = at::randn({4096}, options);

auto output1 = executor_cache.runFusionWithInputs({input1, weight1});
testValidate(
executor_cache.fusion(), output1, {input1, weight1}, __LINE__, __FILE__);

EXPECT_EQ(numRuntimes(), 1) << "First run should compile one kernel";

FusionKernelRuntime* first_runtime =
executor_cache.getMostRecentKernelRuntime();

// Second run with different outer dimension - should reuse the kernel
auto input2 = at::randn({2048 + 8, 4096}, options);
auto weight2 = at::randn({4096}, options);

auto output2 = executor_cache.runFusionWithInputs({input2, weight2});
testValidate(
executor_cache.fusion(), output2, {input2, weight2}, __LINE__, __FILE__);

EXPECT_EQ(numRuntimes(), 1)
<< "Different outer dimension should reuse the existing kernel "
"(dynamic shape support)";
Comment on lines +2370 to +2371
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Comment says "Same dimensions should reuse" but the test uses different outer dimension (2048 + 8 vs 2048) - comment is misleading.


FusionKernelRuntime* second_runtime =
executor_cache.getMostRecentKernelRuntime();
EXPECT_EQ(first_runtime, second_runtime)
<< "Should reuse the same runtime despite a different outer dimension";
Comment on lines +2370 to +2376
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

style: the comments are misleading here. Line 2370 says "Same dimensions should reuse the existing kernel" but the shapes are different ({2048, 4096} vs {2048 + 8, 4096}). Line 2376 says "Should reuse the same runtime for identical shapes" but the shapes are not identical. Consider clarifying that the outer dimension changes but should still reuse the kernel due to dynamic shape support.

Suggested change
EXPECT_EQ(numRuntimes(), 1)
<< "Same dimensions should reuse the existing kernel";
FusionKernelRuntime* second_runtime =
executor_cache.getMostRecentKernelRuntime();
EXPECT_EQ(first_runtime, second_runtime)
<< "Should reuse the same runtime for identical shapes";
EXPECT_EQ(numRuntimes(), 1)
<< "Different outer dimension should reuse the existing kernel due to dynamic shape support";
FusionKernelRuntime* second_runtime =
executor_cache.getMostRecentKernelRuntime();
EXPECT_EQ(first_runtime, second_runtime)
<< "Should reuse the same runtime despite different outer dimension";


// Third run with slightly different inner dimension - should reuse the kernel
auto input3 = at::randn({2048 + 8, 4096 - 8}, options);
auto weight3 = at::randn({4096 - 8}, options);

auto output3 = executor_cache.runFusionWithInputs({input3, weight3});
testValidate(
executor_cache.fusion(), output3, {input3, weight3}, __LINE__, __FILE__);

// If launch params are properly included in cache, this should compile a new
// kernel
EXPECT_GE(numRuntimes(), 1)
<< "Different dimensions may create new kernel if launch params differ";
}

} // namespace nvfuser