[AMDGPU] Adding AMDGPU dialect wrapper for ROCDL transpose loads. #145395

lialan · 2025-06-23T19:28:21Z

1-to-1 mapping wrapper op.
Direct lowering from AMDGPU wrapper to ROCDL intrinsics.

llvmbot · 2025-06-23T19:28:52Z

@llvm/pr-subscribers-backend-amdgpu
@llvm/pr-subscribers-mlir-gpu

@llvm/pr-subscribers-mlir-amdgpu

Author: Alan Li (lialan)

Changes

1-to-1 mapping wrapper op.
Direct lowering from AMDGPU wrapper to ROCDL intrinsics.

Full diff: https://github.com/llvm/llvm-project/pull/145395.diff

3 Files Affected:

(modified) mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td (+21)
(modified) mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp (+45-2)
(modified) mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp (+18)

diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
index d58558ac32884..003aff6d38da0 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
@@ -898,6 +898,27 @@ def AMDGPU_GatherToLDSOp :
   let hasVerifier = 1;
 }
 
+def AMDGPU_TransposeLoadOp : // ODS definition of the `amdgpu.transpose_load` op.
+    AMDGPU_Op<"transpose_load", [SameVariadicOperandSize]>,
+    Arguments<(ins Arg<AnyMemRef, "buffer to transpose load from", [MemRead]>:$src, Variadic<Index>:$srcIndices)>, // $src carries a MemRead effect; $srcIndices is a variadic list of index operands.
+    Results<(outs MFMAInTypes:$dst)> { // Single result $dst, constrained to MFMAInTypes.
+  let summary = "MLIR wrapper for CDNA Transpose Load instructions";
+  let description = [{
+    The `amdgpu.transpose_load` op is a wrapper around the `ds_read_tr` instructions.
+
+    Operands:
+    * `$src`: LDS memref to read from.
+    * `$srcIndices`: indices into `$src` to read from for this thread.
+    * `$dst`: target register this transpose load instruction will write to.
+
+    Note: Lowering is only supported on gfx950 and up.
+  }];
+  let assemblyFormat = [{
+    $src `[` $srcIndices `]` attr-dict `:` type($src) `->` type($dst)
+  }];
+  let hasVerifier = 1; // Requests a hand-written C++ verify() hook for this op.
+}
+
 def AMDGPU_ScaledMFMAOp :
     AMDGPU_Op<"scaled_mfma", [AllTypesMatch<["destC", "destD"]>,
                         Pure]>,
diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index 700563460f525..62ed1d871bcfd 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -1100,6 +1100,49 @@ struct WMMAOpLowering : public ConvertOpToLLVMPattern<WMMAOp> {
   }
 };
 
+struct TransposeLoadOpLowering // Conversion pattern replacing TransposeLoadOp with a ROCDL ds_read_tr*_b64 op.
+    : public ConvertOpToLLVMPattern<TransposeLoadOp> {
+  TransposeLoadOpLowering(const LLVMTypeConverter &converter, Chipset chipset)
+      : ConvertOpToLLVMPattern<TransposeLoadOp>(converter), chipset(chipset) {}
+
+  Chipset chipset; // Target chipset supplied when the pattern is constructed.
+
+  LogicalResult
+  matchAndRewrite(TransposeLoadOp op, TransposeLoadOpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    if (chipset < kGfx950) // Emit an op error when the chipset compares below kGfx950.
+      return op.emitOpError("Non-gfx950 chipset not supported");
+
+    Location loc = op.getLoc();
+    auto srcMemRefType = cast<MemRefType>(op.getSrc().getType());
+    Value srcPtr = // Presumably the address of the $src element at $srcIndices; helper is defined outside this hunk.
+        getStridedElementPtr(rewriter, loc, srcMemRefType, adaptor.getSrc(),
+                             (adaptor.getSrcIndices()));
+    auto elementTypeSize = cast<VectorType>(op.getDst().getType()) // Bit width of one element of the result vector.
+                               .getElementType()
+                               .getIntOrFloatBitWidth();
+
+    // Select the op by element bit width: 4, 8 and 16 are handled; any other width (6-bit included) is rejected.
+    switch (elementTypeSize) {
+    case 4:
+      rewriter.replaceOpWithNewOp<ROCDL::ds_read_tr4_b64>(
+          op, op.getDst().getType(), srcPtr);
+      break;
+    case 8:
+      rewriter.replaceOpWithNewOp<ROCDL::ds_read_tr8_b64>(
+          op, op.getDst().getType(), srcPtr);
+      break;
+    case 16:
+      rewriter.replaceOpWithNewOp<ROCDL::ds_read_tr16_b64>(
+          op, op.getDst().getType(), srcPtr);
+      break;
+    default:
+      return op.emitOpError("Unsupported element size for transpose load");
+    }
+    return success(); // Reached only after one of the cases above replaced the op.
+  }
+};
+
 struct GatherToLDSOpLowering : public ConvertOpToLLVMPattern<GatherToLDSOp> {
   GatherToLDSOpLowering(const LLVMTypeConverter &converter, Chipset chipset)
       : ConvertOpToLLVMPattern<GatherToLDSOp>(converter), chipset(chipset) {}
@@ -1749,7 +1792,7 @@ void mlir::populateAMDGPUToROCDLConversionPatterns(LLVMTypeConverter &converter,
            MFMAOpLowering, ScaledMFMAOpLowering, WMMAOpLowering,
            ExtPackedFp8OpLowering, ScaledExtPackedOpLowering,
            PackedScaledTruncOpLowering, PackedTrunc2xFp8OpLowering,
-           PackedStochRoundFp8OpLowering, GatherToLDSOpLowering>(converter,
-                                                                 chipset);
+           PackedStochRoundFp8OpLowering, GatherToLDSOpLowering,
+           TransposeLoadOpLowering>(converter, chipset);
   patterns.add<AMDGPUSwizzleBitModeLowering>(converter);
 }
diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
index 0d0add3094666..00e9019b79647 100644
--- a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
+++ b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
@@ -524,6 +524,24 @@ LogicalResult GatherToLDSOp::verify() {
   return success();
 }
 
+LogicalResult TransposeLoadOp::verify() { // Checks the source memory space and that the result vector totals 64 bits.
+  MemRefType srcType = cast<MemRefType>(getSrc().getType());
+
+  if (!hasWorkgroupMemorySpace(srcType.getMemorySpace())) // Source memref must be in the Workgroup address space.
+    return emitOpError("source memory address space must be Workgroup");
+
+  // TODO: support 6-bit element type vectors.
+  auto transferType = dyn_cast<VectorType>(getDst().getType());
+  if (!transferType) // dyn_cast yields null when the result is not a vector.
+    return emitOpError("destination type must be a vector type");
+  size_t transferSize = // Total result size in bits: element count times element bit width.
+      transferType.getNumElements() * transferType.getElementTypeBitWidth();
+  if (transferSize != 64) // Only results of exactly 64 bits pass verification.
+    return emitOpError("Transfering type size must be 64 bits");
+
+  return success();
+}
+
 #include "mlir/Dialect/AMDGPU/IR/AMDGPUEnums.cpp.inc"
 
 #define GET_ATTRDEF_CLASSES

llvmbot · 2025-06-23T19:28:52Z

@llvm/pr-subscribers-mlir

Author: Alan Li (lialan)

Changes

1-to-1 mapping wrapper op.
Direct lowering from AMDGPU wrapper to ROCDL intrinsics.

Full diff: https://github.com/llvm/llvm-project/pull/145395.diff

3 Files Affected:

(modified) mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td (+21)
(modified) mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp (+45-2)
(modified) mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp (+18)

diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
index d58558ac32884..003aff6d38da0 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
@@ -898,6 +898,27 @@ def AMDGPU_GatherToLDSOp :
   let hasVerifier = 1;
 }
 
+def AMDGPU_TransposeLoadOp : // ODS definition of the `amdgpu.transpose_load` op.
+    AMDGPU_Op<"transpose_load", [SameVariadicOperandSize]>,
+    Arguments<(ins Arg<AnyMemRef, "buffer to transpose load from", [MemRead]>:$src, Variadic<Index>:$srcIndices)>, // $src carries a MemRead effect; $srcIndices is a variadic list of index operands.
+    Results<(outs MFMAInTypes:$dst)> { // Single result $dst, constrained to MFMAInTypes.
+  let summary = "MLIR wrapper for CDNA Transpose Load instructions";
+  let description = [{
+    The `amdgpu.transpose_load` op is a wrapper around the `ds_read_tr` instructions.
+
+    Operands:
+    * `$src`: LDS memref to read from.
+    * `$srcIndices`: indices into `$src` to read from for this thread.
+    * `$dst`: target register this transpose load instruction will write to.
+
+    Note: Lowering is only supported on gfx950 and up.
+  }];
+  let assemblyFormat = [{
+    $src `[` $srcIndices `]` attr-dict `:` type($src) `->` type($dst)
+  }];
+  let hasVerifier = 1; // Requests a hand-written C++ verify() hook for this op.
+}
+
 def AMDGPU_ScaledMFMAOp :
     AMDGPU_Op<"scaled_mfma", [AllTypesMatch<["destC", "destD"]>,
                         Pure]>,
diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index 700563460f525..62ed1d871bcfd 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -1100,6 +1100,49 @@ struct WMMAOpLowering : public ConvertOpToLLVMPattern<WMMAOp> {
   }
 };
 
+struct TransposeLoadOpLowering // Conversion pattern replacing TransposeLoadOp with a ROCDL ds_read_tr*_b64 op.
+    : public ConvertOpToLLVMPattern<TransposeLoadOp> {
+  TransposeLoadOpLowering(const LLVMTypeConverter &converter, Chipset chipset)
+      : ConvertOpToLLVMPattern<TransposeLoadOp>(converter), chipset(chipset) {}
+
+  Chipset chipset; // Target chipset supplied when the pattern is constructed.
+
+  LogicalResult
+  matchAndRewrite(TransposeLoadOp op, TransposeLoadOpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    if (chipset < kGfx950) // Emit an op error when the chipset compares below kGfx950.
+      return op.emitOpError("Non-gfx950 chipset not supported");
+
+    Location loc = op.getLoc();
+    auto srcMemRefType = cast<MemRefType>(op.getSrc().getType());
+    Value srcPtr = // Presumably the address of the $src element at $srcIndices; helper is defined outside this hunk.
+        getStridedElementPtr(rewriter, loc, srcMemRefType, adaptor.getSrc(),
+                             (adaptor.getSrcIndices()));
+    auto elementTypeSize = cast<VectorType>(op.getDst().getType()) // Bit width of one element of the result vector.
+                               .getElementType()
+                               .getIntOrFloatBitWidth();
+
+    // Select the op by element bit width: 4, 8 and 16 are handled; any other width (6-bit included) is rejected.
+    switch (elementTypeSize) {
+    case 4:
+      rewriter.replaceOpWithNewOp<ROCDL::ds_read_tr4_b64>(
+          op, op.getDst().getType(), srcPtr);
+      break;
+    case 8:
+      rewriter.replaceOpWithNewOp<ROCDL::ds_read_tr8_b64>(
+          op, op.getDst().getType(), srcPtr);
+      break;
+    case 16:
+      rewriter.replaceOpWithNewOp<ROCDL::ds_read_tr16_b64>(
+          op, op.getDst().getType(), srcPtr);
+      break;
+    default:
+      return op.emitOpError("Unsupported element size for transpose load");
+    }
+    return success(); // Reached only after one of the cases above replaced the op.
+  }
+};
+
 struct GatherToLDSOpLowering : public ConvertOpToLLVMPattern<GatherToLDSOp> {
   GatherToLDSOpLowering(const LLVMTypeConverter &converter, Chipset chipset)
       : ConvertOpToLLVMPattern<GatherToLDSOp>(converter), chipset(chipset) {}
@@ -1749,7 +1792,7 @@ void mlir::populateAMDGPUToROCDLConversionPatterns(LLVMTypeConverter &converter,
            MFMAOpLowering, ScaledMFMAOpLowering, WMMAOpLowering,
            ExtPackedFp8OpLowering, ScaledExtPackedOpLowering,
            PackedScaledTruncOpLowering, PackedTrunc2xFp8OpLowering,
-           PackedStochRoundFp8OpLowering, GatherToLDSOpLowering>(converter,
-                                                                 chipset);
+           PackedStochRoundFp8OpLowering, GatherToLDSOpLowering,
+           TransposeLoadOpLowering>(converter, chipset);
   patterns.add<AMDGPUSwizzleBitModeLowering>(converter);
 }
diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
index 0d0add3094666..00e9019b79647 100644
--- a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
+++ b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
@@ -524,6 +524,24 @@ LogicalResult GatherToLDSOp::verify() {
   return success();
 }
 
+LogicalResult TransposeLoadOp::verify() { // Checks the source memory space and that the result vector totals 64 bits.
+  MemRefType srcType = cast<MemRefType>(getSrc().getType());
+
+  if (!hasWorkgroupMemorySpace(srcType.getMemorySpace())) // Source memref must be in the Workgroup address space.
+    return emitOpError("source memory address space must be Workgroup");
+
+  // TODO: support 6-bit element type vectors.
+  auto transferType = dyn_cast<VectorType>(getDst().getType());
+  if (!transferType) // dyn_cast yields null when the result is not a vector.
+    return emitOpError("destination type must be a vector type");
+  size_t transferSize = // Total result size in bits: element count times element bit width.
+      transferType.getNumElements() * transferType.getElementTypeBitWidth();
+  if (transferSize != 64) // Only results of exactly 64 bits pass verification.
+    return emitOpError("Transfering type size must be 64 bits");
+
+  return success();
+}
+
 #include "mlir/Dialect/AMDGPU/IR/AMDGPUEnums.cpp.inc"
 
 #define GET_ATTRDEF_CLASSES

Copilot

Pull Request Overview

This PR introduces a new amdgpu.transpose_load wrapper operation with verification, TableGen definition, and direct lowering to ROCDL intrinsics.

Added TableGen op definition for TransposeLoadOp in the AMDGPU dialect.
Implemented TransposeLoadOp::verify() to enforce memory space and type constraints.
Created a conversion pattern to lower TransposeLoadOp to ROCDL ds_read_tr intrinsics.

Reviewed Changes

Copilot reviewed 3 out of 3 changed files in this pull request and generated 3 comments.

File	Description
mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp	Added `TransposeLoadOp::verify()`
mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td	Defined `AMDGPU_TransposeLoadOp` in TableGen
mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp	Added `TransposeLoadOpLowering` and registered it

Comments suppressed due to low confidence (1)

mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp:1796

There are no corresponding tests for the new TransposeLoadOp and its lowering. Consider adding unit tests to cover verification and lowering paths for different element sizes and unsupported cases.

           TransposeLoadOpLowering>(converter, chipset);

mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp

mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp

* 1-to-1 mapping wrapper op.
* Direct lowering from AMDGPU wrapper to ROCDL intrinsics.

krzysz00

Missing AMDGPU dialect tests to show the op

Missing tests for the lowering

Maybe missing a narrow type emulation pattern

mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp

mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp

mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td

lialan · 2025-06-23T19:43:22Z

@krzysz00 My bad, I forgot to include the test file in the PR. Updated.

But what do we need for emulating narrow types?

krzysz00 · 2025-06-23T20:04:06Z

We'll want to make a pattern on this op that's analogous to the ones in mlir/lib/Dialect/MemRef/Transforms/EmulateNarrowType.cpp ... there's a reason I was working on an interface for this sort of thing that I never got around to.

In short, this pass turns memref<... x [small type]> into memref<N x i8> and rewrites the indexing accordingly. We'll want to do the indexing adjustments, but keep returning the <L x {i4, f4E2M1FN, i6, ...}> directly.

kuhar · 2025-06-23T20:15:14Z

mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td

+
+    Note: Lowering is only supported on gfx950 and up.
+  }];
+  let assemblyFormat = [{


I know other ops here don't provide examples, but I think it would be worth adding going forward -- I rely on these all the time

I like your idea. So I tried to add a very simple example to show the format of the op. In terms of the semantics of the instruction, it is too hard to explain in a few sentences so I wrote that "please refer to the actual document for detailed explanation".

Probably call out that you mean the CDNA4 ISA manual

mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp

mlir/test/Conversion/AMDGPUToROCDL/transpose_load.mlir

krzysz00 · 2025-06-23T23:14:05Z

mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td

+  F8E3M4          // 3 exponent, 4 mantissa
+]>;
+def F6Types : AnyTypeOf<[F6E2M3FN, F6E3M2FN]>;
+def TrLoadTypes : AnyTypeOf<[VectorOfLengthAndType<[4], [F16, AnyI<16>]>,


BF16 exists ... and also, we can probably leave this open and rely on a getIntOrFloatBitWidth() check in the verifier?

yeah, now it accepts any vectors and the verifier will serve as the checker.

mlir/test/Conversion/AMDGPUToROCDL/transpose_load.mlir

github-actions · 2025-06-24T01:43:29Z

✅ With the latest revision this PR passed the C/C++ code formatter.

mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp

krzysz00

Blocking because the lowering's got some footguns in it we need to get rid of

mlir/test/Conversion/AMDGPUToROCDL/transpose_load.mlir

mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp

mlir/test/Conversion/AMDGPUToROCDL/transpose_load.mlir

krzysz00 · 2025-06-24T16:13:27Z

mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td

+
+    Note: Lowering is only supported on gfx950 and up.
+  }];
+  let assemblyFormat = [{


Probably call out that you mean the CDNA4 ISA manual

Copilot

Pull Request Overview

Adds a 1:1 AMDGPU dialect wrapper for ROCDL transpose-load instructions, with direct lowering on gfx950+ and accompanying tests.

Introduce amdgpu.transpose_load op in the AMDGPU dialect (.td), implement semantic verification and conversion lowering to ROCDL intrinsics.
Add positive and negative MLIR tests for valid and unsupported element sizes in the conversion suite.
Wire up the new pattern in the AMDGPU→ROCDL conversion pipeline.

Reviewed Changes

Copilot reviewed 5 out of 5 changed files in this pull request and generated 1 comment.

Show a summary per file

File	Description
mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td	Define `AMDGPU_TransposeLoadOp` with assembly format and docs
mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp	Implement `TransposeLoadOp::verify()` for basic checks
mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp	Add `TransposeLoadOpLowering` to lower to `ROCDL::ds_read_tr*`
mlir/test/Conversion/AMDGPUToROCDL/transpose_load.mlir	Positive tests for various supported bit-width patterns
mlir/test/Conversion/AMDGPUToROCDL/transpose_load_reject.mlir	Negative tests rejecting sub-byte element sizes

Comments suppressed due to low confidence (2)

mlir/test/Conversion/AMDGPUToROCDL/transpose_load_reject.mlir:1

No test covers the default fallback for unsupported element sizes (e.g., 12-bit); consider adding a case to exercise the Unsupported element size for transpose load path.

// RUN: not mlir-opt %s --split-input-file -convert-amdgpu-to-rocdl=chipset=gfx950 2>&1 | FileCheck %s

mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp:27

SmallDenseMap is used below but the header for it isn’t included; add #include "llvm/ADT/SmallDenseMap.h" to avoid compilation errors.

#include "llvm/ADT/DenseMap.h"

mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp

mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp

krzysz00

Let's add tests to mlir/test/Dialect/AMDGPU/ops.mlir and invalid.mlir

Code itself LGTM

krzysz00

Approved, thanks for making this happen!

lialan requested a review from Copilot June 23, 2025 19:28

llvmbot added backend:AMDGPU mlir:gpu mlir mlir:amdgpu labels Jun 23, 2025

Copilot AI reviewed Jun 23, 2025

View reviewed changes

mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp Show resolved Hide resolved

mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp Outdated Show resolved Hide resolved

mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp Outdated Show resolved Hide resolved

[AMDGPU] Adding AMDGPU dialect wrapper for ROCDL transpose loads.

2009ede

* 1-to-1 mapping wrapper op.
* Direct lowering from AMDGPU wrapper to ROCDL intrinsics.

lialan force-pushed the lialan/tr_load branch from a504722 to 2009ede Compare June 23, 2025 19:31

lialan requested review from krzysz00 and kuhar June 23, 2025 19:37

krzysz00 reviewed Jun 23, 2025

View reviewed changes

Adding a test file

50d19a6

Adding 6-bit loads.

087046a

kuhar reviewed Jun 23, 2025

View reviewed changes

lialan added 3 commits June 23, 2025 16:29

Adding support for 6-bit loadings.

fa30258

Adding check nots.

4259f63

Update the doc in the code.

c8157f0

krzysz00 reviewed Jun 23, 2025

View reviewed changes

lialan added 2 commits June 23, 2025 21:17

Update

bbb57ea

Adding loads from different value type.

60e2c56

kuhar reviewed Jun 24, 2025

View reviewed changes

mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp Outdated Show resolved Hide resolved

Update

9bba79f

lialan force-pushed the lialan/tr_load branch from c1753b3 to 9bba79f Compare June 24, 2025 02:02

krzysz00 requested changes Jun 24, 2025

View reviewed changes

Use i6 instead of i32.

b5b4e6f

krzysz00 reviewed Jun 24, 2025

View reviewed changes

lialan added 3 commits June 24, 2025 12:20

Reject subbyte memrefs.

207f2f4

sub byte and byte elements use i32 as place holder.

db9b837

small update

32f0edf

lialan requested review from krzysz00, Copilot and kuhar June 24, 2025 17:37

another update

c13aec2

This comment was marked as outdated.

Sign in to view

lialan requested a review from Copilot June 24, 2025 17:48

Copilot AI reviewed Jun 24, 2025

View reviewed changes

mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp Show resolved Hide resolved

krzysz00 reviewed Jun 25, 2025

View reviewed changes

mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp Show resolved Hide resolved

krzysz00 reviewed Jun 25, 2025

View reviewed changes

lialan added 2 commits June 25, 2025 14:16

Adding op tests.

94f73d5

more invalid tests.

1f03a6d

lialan requested a review from krzysz00 June 25, 2025 19:48

krzysz00 approved these changes Jun 26, 2025

View reviewed changes

lialan added 2 commits June 25, 2025 22:32

Fix test.

4c3c64f

Merge branch 'main' into lialan/tr_load

c9ca046

lialan merged commit 3f3282c into llvm:main Jun 26, 2025
7 checks passed

lialan deleted the lialan/tr_load branch June 26, 2025 02:58

[AMDGPU] Adding AMDGPU dialect wrapper for ROCDL transpose loads. #145395

[AMDGPU] Adding AMDGPU dialect wrapper for ROCDL transpose loads. #145395

Conversation

lialan commented Jun 23, 2025

Uh oh!

llvmbot commented Jun 23, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

llvmbot commented Jun 23, 2025

Uh oh!

Copilot AI left a comment

Choose a reason for hiding this comment

Pull Request Overview

Reviewed Changes

Uh oh!

Uh oh!

Uh oh!

Uh oh!

krzysz00 left a comment

Choose a reason for hiding this comment

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

lialan commented Jun 23, 2025

Uh oh!

krzysz00 commented Jun 23, 2025

Uh oh!

kuhar Jun 23, 2025

Choose a reason for hiding this comment

Uh oh!

lialan Jun 24, 2025

Choose a reason for hiding this comment

Uh oh!

krzysz00 Jun 24, 2025

Choose a reason for hiding this comment

Uh oh!

Uh oh!

Uh oh!

krzysz00 Jun 23, 2025

Choose a reason for hiding this comment

Uh oh!

lialan Jun 24, 2025

Choose a reason for hiding this comment

Uh oh!

Uh oh!

github-actions bot commented Jun 24, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

Uh oh!

krzysz00 left a comment

Choose a reason for hiding this comment

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

krzysz00 Jun 24, 2025

Choose a reason for hiding this comment

Uh oh!

This comment was marked as outdated.

Uh oh!

Copilot AI left a comment

Choose a reason for hiding this comment

Pull Request Overview

Reviewed Changes

Uh oh!

Uh oh!

Uh oh!

krzysz00 left a comment

Choose a reason for hiding this comment

Uh oh!

krzysz00 left a comment

Choose a reason for hiding this comment

Uh oh!

Uh oh!

Uh oh!

llvmbot commented Jun 23, 2025 •

edited

Loading

github-actions bot commented Jun 24, 2025 •

edited

Loading