Skip to content

[mlir][SCF][TilingInterface] Add support for divisibility hints to tiling options #138314

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions mlir/include/mlir/Dialect/SCF/Transforms/TileUsingInterface.h
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,17 @@ struct SCFTilingOptions {
mappingVector = llvm::to_vector(mapping);
return *this;
}

/// Gives hints for whether the tile sizes divide the iteration space evenly.
/// For static sizes this is trivially verifiable (and the helpers here take
/// advantage of that); however, for dynamic sizes we are always forced to be
/// pessimistic. This allows an external analysis to check for divisibility and
/// pass the information on to tiling.
SmallVector<bool> divisibilityHint = {};
SCFTilingOptions &setDivisibilityHint(ArrayRef<bool> hint) {
divisibilityHint.assign(hint.begin(), hint.end());
return *this;
}
};

/// Transformation information returned after tiling.
Expand Down
31 changes: 20 additions & 11 deletions mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -248,7 +248,8 @@ static std::tuple<SmallVector<OpFoldResult>, SmallVector<OpFoldResult>>
getTileOffsetAndSizes(RewriterBase &rewriter, Location loc, ValueRange ivs,
ArrayRef<Range> iterationDomain,
ArrayRef<OpFoldResult> tileSizes,
ArrayRef<OpFoldResult> numThreads) {
ArrayRef<OpFoldResult> numThreads,
ArrayRef<bool> divisibilityHint) {
SmallVector<OpFoldResult> offsets, sizes;
int materializedLoopNum = 0;

Expand All @@ -260,8 +261,8 @@ getTileOffsetAndSizes(RewriterBase &rewriter, Location loc, ValueRange ivs,
offsetExpr = d0 + d1 * s0;
residualTileSizeExpr = s1 - (d0 + d1 * s0);

for (auto [nt, tileSize, loopRange] :
llvm::zip_equal(numThreads, tileSizes, iterationDomain)) {
for (auto [nt, tileSize, loopRange, divHint] : llvm::zip_equal(
numThreads, tileSizes, iterationDomain, divisibilityHint)) {

// Non-tiled cases, set the offset and size to the
// `loopRange.offset/size`.
Expand All @@ -280,7 +281,7 @@ getTileOffsetAndSizes(RewriterBase &rewriter, Location loc, ValueRange ivs,
{loopRange.offset, nt, tileSize, loopRange.size});

OpFoldResult size = tileSize;
if (!isConstantIntValue(residualTileSize, 0)) {
if (!isConstantIntValue(residualTileSize, 0) && !divHint) {
OpFoldResult sizeMinusOffsetPerThread =
affine::makeComposedFoldedAffineApply(rewriter, loc, s0 - d0,
{offset, loopRange.size});
Expand All @@ -299,7 +300,8 @@ getTileOffsetAndSizes(RewriterBase &rewriter, Location loc, ValueRange ivs,
// `nonNegativeTileSize = affine.max(0, tileSize)`.
// This `max` can be avoided if
// `offset + tileSize * (numThreads - 1) < (ub - lb)`
if (!canOmitTileOffsetInBoundsCheck(tileSize, nt, loopRange.size)) {
if (!canOmitTileOffsetInBoundsCheck(tileSize, nt, loopRange.size) &&
!divHint) {
AffineMap maxMap =
AffineMap::getMultiDimIdentityMap(2, rewriter.getContext());
size = affine::makeComposedFoldedAffineMax(
Expand All @@ -311,8 +313,8 @@ getTileOffsetAndSizes(RewriterBase &rewriter, Location loc, ValueRange ivs,
}
return {offsets, sizes};
} else {
for (auto [tileSize, loopRange] :
llvm::zip_equal(tileSizes, iterationDomain)) {
for (auto [tileSize, loopRange, divHint] :
llvm::zip_equal(tileSizes, iterationDomain, divisibilityHint)) {

// Non-tiled cases, set the offset and size to the
// `loopRange.offset/size`.
Expand All @@ -325,8 +327,9 @@ getTileOffsetAndSizes(RewriterBase &rewriter, Location loc, ValueRange ivs,
Value iv = ivs[materializedLoopNum++];
OpFoldResult offset = getAsOpFoldResult(iv);
offsets.push_back(offset);
OpFoldResult size =
getBoundedTileSize(rewriter, loc, loopRange, offset, tileSize);
OpFoldResult size = divHint ? tileSize
: getBoundedTileSize(rewriter, loc, loopRange,
offset, tileSize);
sizes.push_back(size);
}
return {offsets, sizes};
Expand Down Expand Up @@ -950,6 +953,11 @@ mlir::scf::tileUsingSCF(RewriterBase &rewriter, TilingInterface op,
std::tie(tileSizes, numThreads) =
getUserTileSizesAndNumThreads(rewriter, op, iterationDomain, options);

// 2a. Pad the divisibility hints to the domain rank.
SmallVector<bool> divisibilityHint = options.divisibilityHint;
divisibilityHint.append(iterationDomain.size() - divisibilityHint.size(),
false);

// Check if it is safe to tile. This is a holdover from previous iterations
// of tiling to for-all. Consider dropping it.
if (options.loopType == scf::SCFTilingOptions::LoopType::ForallOp) {
Expand Down Expand Up @@ -982,8 +990,9 @@ mlir::scf::tileUsingSCF(RewriterBase &rewriter, TilingInterface op,
-> LogicalResult {
// 4a. Compute the `offsets` and `sizes` to use for tiling.
SmallVector<OpFoldResult> offsets, sizes;
std::tie(offsets, sizes) = getTileOffsetAndSizes(
rewriter, loc, ivs, iterationDomain, tileSizes, numThreads);
std::tie(offsets, sizes) =
getTileOffsetAndSizes(rewriter, loc, ivs, iterationDomain, tileSizes,
numThreads, divisibilityHint);

// 4b. If interchange was provided, apply inverse of the interchange
// to get back the offsets/sizes in the order to be specified.
Expand Down
96 changes: 96 additions & 0 deletions mlir/test/Interfaces/TilingInterface/tile-using-scfforall.mlir
Original file line number Diff line number Diff line change
Expand Up @@ -349,3 +349,99 @@ module attributes {transform.with_named_sequence} {
// CHECK-LABEL: func @check_scalar_memref_operation
// CHECK-NOT: scf.for
// CHECK: linalg.generic

// -----

func.func @simple_matmul_assume_divisible_n(%arg0 : tensor<?x?xf32>, %arg1 : tensor<?x?xf32>,
%arg2 : tensor<?x?xf32>) -> tensor<?x?xf32> {
%0 = linalg.matmul
ins(%arg0, %arg1 : tensor<?x?xf32>, tensor<?x?xf32>)
outs(%arg2 : tensor<?x?xf32>) -> tensor<?x?xf32>
return %0 : tensor<?x?xf32>
}

module attributes {transform.with_named_sequence} {
transform.named_sequence @__transform_main(%arg1 : !transform.any_op {transform.readonly}) {
%matmul = transform.structured.match ops{["linalg.matmul"]} in %arg1
: (!transform.any_op) -> !transform.any_op
%a, %b = transform.test.tile_using_forall %matmul [10, 20]
divisibility_hint = [false, true] mapping = [#gpu.block<y>, #gpu.block<x>]
: (!transform.any_op) -> (!transform.any_op, !transform.any_op)
transform.yield
}
}
// CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 10)>
// CHECK: func.func @simple_matmul_assume_divisible_n(
// CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: tensor<?x?xf32>
// CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]]: tensor<?x?xf32>
// CHECK-SAME: %[[ARG2:[a-zA-Z0-9]+]]: tensor<?x?xf32>
// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index
// CHECK-DAG: %[[M:.+]] = tensor.dim %[[ARG0]], %[[C0]]
// CHECK-DAG: %[[K:.+]] = tensor.dim %[[ARG0]], %[[C1]]
// CHECK-DAG: %[[N:.+]] = tensor.dim %[[ARG1]], %[[C1]]
// CHECK: %[[RESULT:.+]] = scf.forall (%[[IV0:[a-zA-Z0-9]+]], %[[IV1:[a-zA-Z0-9]+]]) =
// CHECK-SAME: (0, 0) to (%[[M]], %[[N]]) step (10, 20) shared_outs(%[[INIT:.+]] = %[[ARG2]])
// CHECK: %[[TS_Y:.+]] = affine.min #[[MAP0]](%[[IV0]])[%[[M]]]
// CHECK: %[[LHS_TILE:.+]] = tensor.extract_slice %[[ARG0]]
// CHECK-SAME: [%[[IV0]], 0] [%[[TS_Y]], %[[K]]] [1, 1]
// CHECK: %[[RHS_TILE:.+]] = tensor.extract_slice %[[ARG1]]
// CHECK-SAME: [0, %[[IV1]]] [%[[K]], 20] [1, 1]
// CHECK: %[[INIT_TILE:.+]] = tensor.extract_slice %[[INIT]]
// CHECK-SAME: [%[[IV0]], %[[IV1]]] [%[[TS_Y]], 20] [1, 1]
// CHECK: %[[GEMM_TILE:.+]] = linalg.matmul
// CHECK-SAME: ins(%[[LHS_TILE]], %[[RHS_TILE]] : tensor<?x?xf32>, tensor<?x20xf32>
// CHECK-SAME: outs(%[[INIT_TILE]] :
// CHECK: scf.forall.in_parallel {
// CHECK: tensor.parallel_insert_slice %[[GEMM_TILE]] into %[[INIT]]
// CHECK-SAME: [%[[IV0]], %[[IV1]]] [%[[TS_Y]], 20] [1, 1]
// CHECK: mapping = [#gpu.block<y>, #gpu.block<x>]
// CHECK: return %[[RESULT]]

// -----

func.func @simple_matmul_extend_divisibility(%arg0 : tensor<?x?xf32>, %arg1 : tensor<?x?xf32>,
%arg2 : tensor<?x?xf32>) -> tensor<?x?xf32> {
%0 = linalg.matmul
ins(%arg0, %arg1 : tensor<?x?xf32>, tensor<?x?xf32>)
outs(%arg2 : tensor<?x?xf32>) -> tensor<?x?xf32>
return %0 : tensor<?x?xf32>
}

module attributes {transform.with_named_sequence} {
transform.named_sequence @__transform_main(%arg1 : !transform.any_op {transform.readonly}) {
%matmul = transform.structured.match ops{["linalg.matmul"]} in %arg1
: (!transform.any_op) -> !transform.any_op
%a, %b = transform.test.tile_using_forall %matmul [10, 20]
divisibility_hint = [true] mapping = [#gpu.block<y>, #gpu.block<x>]
: (!transform.any_op) -> (!transform.any_op, !transform.any_op)
transform.yield
}
}
// CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 20)>
// CHECK: func.func @simple_matmul_extend_divisibility(
// CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: tensor<?x?xf32>
// CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]]: tensor<?x?xf32>
// CHECK-SAME: %[[ARG2:[a-zA-Z0-9]+]]: tensor<?x?xf32>
// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index
// CHECK-DAG: %[[M:.+]] = tensor.dim %[[ARG0]], %[[C0]]
// CHECK-DAG: %[[K:.+]] = tensor.dim %[[ARG0]], %[[C1]]
// CHECK-DAG: %[[N:.+]] = tensor.dim %[[ARG1]], %[[C1]]
// CHECK: %[[RESULT:.+]] = scf.forall (%[[IV0:[a-zA-Z0-9]+]], %[[IV1:[a-zA-Z0-9]+]]) =
// CHECK-SAME: (0, 0) to (%[[M]], %[[N]]) step (10, 20) shared_outs(%[[INIT:.+]] = %[[ARG2]])
// CHECK: %[[TS_X:.+]] = affine.min #[[MAP0]](%[[IV1]])[%[[N]]]
// CHECK: %[[LHS_TILE:.+]] = tensor.extract_slice %[[ARG0]]
// CHECK-SAME: [%[[IV0]], 0] [10, %[[K]]] [1, 1]
// CHECK: %[[RHS_TILE:.+]] = tensor.extract_slice %[[ARG1]]
// CHECK-SAME: [0, %[[IV1]]] [%[[K]], %[[TS_X]]] [1, 1]
// CHECK: %[[INIT_TILE:.+]] = tensor.extract_slice %[[INIT]]
// CHECK-SAME: [%[[IV0]], %[[IV1]]] [10, %[[TS_X]]] [1, 1]
// CHECK: %[[GEMM_TILE:.+]] = linalg.matmul
// CHECK-SAME: ins(%[[LHS_TILE]], %[[RHS_TILE]] : tensor<10x?xf32>, tensor<?x?xf32>
// CHECK-SAME: outs(%[[INIT_TILE]] :
// CHECK: scf.forall.in_parallel {
// CHECK: tensor.parallel_insert_slice %[[GEMM_TILE]] into %[[INIT]]
// CHECK-SAME: [%[[IV0]], %[[IV1]]] [10, %[[TS_X]]] [1, 1]
// CHECK: mapping = [#gpu.block<y>, #gpu.block<x>]
// CHECK: return %[[RESULT]]
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
#include "mlir/IR/Dominance.h"
#include "mlir/IR/OpImplementation.h"
#include "mlir/Interfaces/TilingInterface.h"
#include "llvm/ADT/SmallVectorExtras.h"

#define GET_OP_CLASSES
#include "TestTilingInterfaceTransformOps.h.inc"
Expand Down Expand Up @@ -54,12 +55,11 @@ static llvm::SmallDenseSet<Operation *> collectTiledAndFusedOps(Operation *op) {
/// Apply a tile and fuse transformation to all payload ops and store both the
/// tiled operation as well as the created tile loops.
template <typename Range>
static LogicalResult
applyTileAndFuseToAll(RewriterBase &rewriter, Operation *transformOp,
Range &&payloadOps, unsigned numLoops,
ArrayRef<OpFoldResult> tileSizes,
ArrayRef<int64_t> interchange, bool useForall,
TransformResults &transformResults) {
static LogicalResult applyTileAndFuseToAll(
RewriterBase &rewriter, Operation *transformOp, Range &&payloadOps,
unsigned numLoops, ArrayRef<OpFoldResult> tileSizes,
ArrayRef<int64_t> interchange, ArrayRef<bool> divisibilityHint,
bool useForall, TransformResults &transformResults) {
SmallVector<Operation *> tiledOps;
SmallVector<SmallVector<Operation *>> loopOps(numLoops);

Expand All @@ -85,6 +85,7 @@ applyTileAndFuseToAll(RewriterBase &rewriter, Operation *transformOp,
if (useForall) {
tilingOptions.setLoopType(scf::SCFTilingOptions::LoopType::ForallOp);
}
tilingOptions.setDivisibilityHint(divisibilityHint);

scf::SCFTileAndFuseOptions tileAndFuseOptions;
tileAndFuseOptions.setTilingOptions(tilingOptions);
Expand Down Expand Up @@ -151,13 +152,16 @@ transform::TestFuseAndYieldOp::apply(TransformRewriter &rewriter,
SmallVector<int64_t> tileInterchange =
extractFromIntegerArrayAttr<int64_t>(getTileInterchange());

SmallVector<bool> divisibilityHint(
getDivisibilityHint().getAsValueRange<BoolAttr>());

SmallVector<OpFoldResult> tileSizesOfr =
getAsIndexOpFoldResult(rewriter.getContext(), tileSizes);

LogicalResult result = applyTileAndFuseToAll(
rewriter, getOperation(), state.getPayloadOps(getTarget()),
tileSizes.size() - llvm::count(tileSizes, 0), tileSizesOfr,
tileInterchange, getUseForall(), transformResults);
tileInterchange, divisibilityHint, getUseForall(), transformResults);
return failed(result) ? DiagnosedSilenceableFailure::definiteFailure()
: DiagnosedSilenceableFailure::success();
}
Expand Down Expand Up @@ -237,7 +241,8 @@ template <typename Range>
static LogicalResult
applyTileToAll(RewriterBase &rewriter, Operation *transformOp,
Range &&payloadOps, ArrayRef<OpFoldResult> tileSizes,
ArrayRef<int64_t> interchange, std::optional<ArrayAttr> mapping,
ArrayRef<int64_t> interchange, ArrayRef<bool> divisibilityHint,
std::optional<ArrayAttr> mapping,
TransformResults &transformResults) {
SmallVector<Operation *> tiledOps;
SmallVector<Operation *> loopOps;
Expand All @@ -251,6 +256,7 @@ applyTileToAll(RewriterBase &rewriter, Operation *transformOp,
if (mapping) {
tilingOptions.setMapping(mapping.value().getValue());
}
tilingOptions.setDivisibilityHint(divisibilityHint);
tilingOptions.setLoopType(scf::SCFTilingOptions::LoopType::ForallOp);

rewriter.setInsertionPoint(target);
Expand Down Expand Up @@ -287,9 +293,12 @@ transform::TestTileUsingForallOp::apply(TransformRewriter &rewriter,
SmallVector<OpFoldResult> tileSizesOfr =
getAsIndexOpFoldResult(rewriter.getContext(), tileSizes);

LogicalResult result =
applyTileToAll(rewriter, getOperation(), state.getPayloadOps(getTarget()),
tileSizesOfr, interchange, getMapping(), transformResults);
SmallVector<bool> divisibilityHint(
getDivisibilityHint().getAsValueRange<BoolAttr>());

LogicalResult result = applyTileToAll(
rewriter, getOperation(), state.getPayloadOps(getTarget()), tileSizesOfr,
interchange, divisibilityHint, getMapping(), transformResults);
return failed(result) ? DiagnosedSilenceableFailure::definiteFailure()
: DiagnosedSilenceableFailure::success();
}
Expand Down Expand Up @@ -363,11 +372,15 @@ transform::TestFuseUsingForallOp::apply(TransformRewriter &rewriter,
SmallVector<int64_t> tileInterchange =
extractFromIntegerArrayAttr<int64_t>(getInterchange());

SmallVector<bool> divisibilityHint(
getDivisibilityHint().getAsValueRange<BoolAttr>());

scf::SCFTilingOptions tilingOptions;
tilingOptions.interchangeVector = tileInterchange;
SmallVector<OpFoldResult> tileSizesOfr =
getAsIndexOpFoldResult(rewriter.getContext(), tileSizes);
tilingOptions = tilingOptions.setTileSizes(tileSizesOfr);
tilingOptions = tilingOptions.setDivisibilityHint(divisibilityHint);
tilingOptions.setLoopType(scf::SCFTilingOptions::LoopType::ForallOp);
scf::SCFTileAndFuseOptions tileAndFuseOptions;
tileAndFuseOptions.tilingOptions = tilingOptions;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,12 +38,14 @@ def TestFuseAndYieldOp : Op<Transform_Dialect, "test.fuse_and_yield",
(ins TransformHandleTypeInterface:$target,
DefaultValuedAttr<I64ArrayAttr, "{}">:$tile_sizes,
DefaultValuedAttr<I64ArrayAttr, "{}">:$tile_interchange,
DefaultValuedOptionalAttr<BoolArrayAttr, "{}">:$divisibility_hint,
DefaultValuedAttr<BoolAttr, "false">:$use_forall);
let results = (outs TransformHandleTypeInterface:$transfomed,
Variadic<TransformHandleTypeInterface>:$loops);

let assemblyFormat = [{
$target ($tile_sizes^)? (`interchange` $tile_interchange^)?
(`divisibility_hint` `=` $divisibility_hint^)?
(`use_forall` $use_forall^)? attr-dict
`:` functional-type(operands, results)
}];
Expand Down Expand Up @@ -91,12 +93,14 @@ def TestTileUsingForallOp : Op<Transform_Dialect, "test.tile_using_forall",
let arguments = (ins TransformHandleTypeInterface:$target,
DefaultValuedAttr<I64ArrayAttr, "{}">:$tile_sizes,
DefaultValuedOptionalAttr<I64ArrayAttr, "{}">:$interchange,
DefaultValuedOptionalAttr<BoolArrayAttr, "{}">:$divisibility_hint,
OptionalAttr<DeviceMappingArrayAttr>:$mapping);
let results = (outs TransformHandleTypeInterface:$tiled_op,
Variadic<TransformHandleTypeInterface>:$loops);

let assemblyFormat = [{
$target ($tile_sizes^)? (`interchange` `=` $interchange^)?
(`divisibility_hint` `=` $divisibility_hint^)?
(`mapping` `=` $mapping^)?
attr-dict `:` functional-type(operands, results)
}];
Expand All @@ -114,12 +118,14 @@ def TestFuseUsingForallOp : Op<Transform_Dialect, "test.fuse_using_forall",
let arguments = (ins TransformHandleTypeInterface:$root_op,
DefaultValuedAttr<I64ArrayAttr, "{}">:$tile_sizes,
DefaultValuedOptionalAttr<I64ArrayAttr, "{}">:$interchange,
DefaultValuedOptionalAttr<BoolArrayAttr, "{}">:$divisibility_hint,
OptionalAttr<DeviceMappingArrayAttr>:$mapping);
let results = (outs TransformHandleTypeInterface:$tiled_ops,
Variadic<TransformHandleTypeInterface>:$loops);

let assemblyFormat = [{
$root_op ($tile_sizes^)? (`interchange` $interchange^)?
(`divisibility_hint` `=` $divisibility_hint^)?
(`mapping` `=` $mapping^)?
attr-dict `:` functional-type(operands, results)
}];
Expand Down