Correctly set return size of 2d block read

jgu222 · igcbot · commit 384af597d26e · 2025-04-09T21:30:58.000+02:00
A 2D block read requires that each block be a multiple of the GRF size;
thus the total return size must also be a multiple of the GRF size.

This change makes sure the return size is a multiple of the GRF size,
which is required for correctness.
diff --git a/IGC/Compiler/CISACodeGen/CShader.cpp b/IGC/Compiler/CISACodeGen/CShader.cpp
@@ -1721,6 +1721,73 @@ uint CShader::GetNbElementAndMask(llvm::Value* value, uint32_t& mask)
             // Number elements = {num GRFs} * {num DWords in GRF} = {num GRFs} * 8;
             return int_cast<unsigned int>(cast<ConstantInt>(numGRFs)->getZExtValue() * 8);
         }
+        case GenISAIntrinsic::GenISA_LSC2DBlockRead:
+        case GenISAIntrinsic::GenISA_LSC2DBlockReadAddrPayload:
+        {
+            // 2D block read requires its block size to be multiple of GRF size.
+            uint32_t eltBits, blkWidth, blkHeight, numBlks;
+            bool isTranspose, isVnni;
+            if (IID == GenISAIntrinsic::GenISA_LSC2DBlockRead)
+            {
+                eltBits = (uint32_t)cast<ConstantInt>(inst->getOperand(6))->getZExtValue();
+                blkWidth = (uint32_t)cast<ConstantInt>(inst->getOperand(7))->getZExtValue();
+                blkHeight = (uint32_t)cast<ConstantInt>(inst->getOperand(8))->getZExtValue();
+                numBlks = (uint32_t)cast<ConstantInt>(inst->getOperand(9))->getZExtValue();
+                isTranspose = (uint)cast<ConstantInt>(inst->getOperand(10))->getZExtValue();
+                isVnni = (uint)cast<ConstantInt>(inst->getOperand(11))->getZExtValue();
+            }
+            else
+            {
+                IGC_ASSERT(IID == GenISAIntrinsic::GenISA_LSC2DBlockReadAddrPayload);
+                eltBits = (uint32_t)cast<ConstantInt>(inst->getOperand(3))->getZExtValue();
+                blkWidth = (uint32_t)cast<ConstantInt>(inst->getOperand(4))->getZExtValue();
+                blkHeight = (uint32_t)cast<ConstantInt>(inst->getOperand(5))->getZExtValue();
+                numBlks = (uint32_t)cast<ConstantInt>(inst->getOperand(6))->getZExtValue();
+                isTranspose = cast<ConstantInt>(inst->getOperand(7))->getZExtValue();
+                isVnni = cast<ConstantInt>(inst->getOperand(8))->getZExtValue();
+            }
+
+            // Width is padded to the next power-of-2 value
+            uint32_t blkWidthCeil = (uint32_t)PowerOf2Ceil(blkWidth);
+            if (blkWidthCeil != blkWidth)
+            {
+                m_ctx->EmitWarning("Block2D: block width not power of 2, zero padded.");
+            }
+            uint32_t blkHeightCeil = blkHeight;
+            if (isTranspose)
+            {
+                blkHeightCeil = (uint32_t)PowerOf2Ceil(blkHeight);
+                if (blkHeightCeil != blkHeight)
+                {
+                    m_ctx->EmitWarning("Block2D: transpose block height not power of 2, zero padded.");
+                }
+            }
+            if (isVnni)
+            {
+                IGC_ASSERT(eltBits == 16 || eltBits == 8);
+                uint32_t N = 32 / eltBits;
+                uint32_t origVal = blkHeightCeil;
+                blkHeightCeil = (uint32_t)divideCeil(blkHeightCeil, N) * N;
+                if (blkHeightCeil != origVal)
+                {
+                    m_ctx->EmitWarning("Block2D: transform block height not multiple "
+                        "of N (32/eltBits), zero padded.");
+                }
+            }
+            uint32_t blkBits = blkWidthCeil * blkHeightCeil * eltBits;
+            uint32_t numGRFsPerBlk = (uint32_t)divideCeil(blkBits, getGRFSize() * 8);
+            uint32_t blkBitsCeil = getGRFSize() * 8 * numGRFsPerBlk;
+            if (blkBitsCeil != blkBits)
+            {
+                m_ctx->EmitWarning("Block2D: block size not multiple of GRF size, zero padded.");
+            }
+            uint32_t numGRFs = numGRFsPerBlk * numBlks;
+            VISA_Type visaTy = GetType(inst->getType());
+            uint32_t eltTyBytes = CEncoder::GetCISADataTypeSize(visaTy);
+            uint32_t nbElement = (uint32_t)divideCeil(numGRFs * getGRFSize(), eltTyBytes);
+
+            return nbElement;
+        }
         default:
             break;
         }
diff --git a/IGC/Compiler/CISACodeGen/EmitVISAPass.cpp b/IGC/Compiler/CISACodeGen/EmitVISAPass.cpp
@@ -24710,21 +24710,6 @@ void EmitPass::emitLSC2DBlockOperation(llvm::GenIntrinsicInst* inst)
     bool isVnni = (uint)cast<ConstantInt>(inst->getOperand(11))->getZExtValue();
 
     CVariable* destination = m_destination;
-    if (numBlocksV == 2 && blockHeight == 1 &&
-        !isPrefetch &&
-        elemSizeInBits * blockWidth == 256 &&
-        m_currShader->m_Platform->getPlatformInfo().eProductFamily >= IGFX_PVC)
-    {
-        // m1 v2 block read has grf aligned V blocks,
-        // variable should be 2x larger
-        destination = m_currShader->GetNewVariable(
-            m_destination->GetNumberElement() * 2,
-            m_destination->GetType(),
-            m_destination->GetAlign(),
-            m_destination->IsUniform(),
-            m_destination->GetNumberInstance(),
-            CName::NONE);
-    }
     LSC_CACHE_OPTS cacheOpts = translateLSCCacheControlsFromValue(inst->getOperand(12), isRead);
 
     if (isRead == false)
@@ -24755,20 +24740,6 @@ void EmitPass::emitLSC2DBlockOperation(llvm::GenIntrinsicInst* inst)
             pFlatImagePitch,
             cacheOpts);
         m_encoder->Push();
-
-
-        if (isRead &&
-            !isPrefetch &&
-            destination != m_destination)
-        {
-            // m1 v2 block read
-            m_encoder->Copy(m_destination, destination);
-            m_encoder->Push();
-            m_encoder->SetSrcSubVar(0, 1);
-            m_encoder->SetDstSubReg(m_destination->GetNumberElement() / 2);
-            m_encoder->Copy(m_destination, destination);
-            m_encoder->Push();
-        }
         return;
     }
 
diff --git a/IGC/Compiler/tests/EmitVISAPass/block2d_addrpayload_return_size.ll b/IGC/Compiler/tests/EmitVISAPass/block2d_addrpayload_return_size.ll
@@ -0,0 +1,80 @@
+;=========================== begin_copyright_notice ============================
+;
+; Copyright (C) 2023 Intel Corporation
+;
+; SPDX-License-Identifier: MIT
+;
+;============================ end_copyright_notice =============================
+; REQUIRES: regkeys, llvm-14-plus
+
+; RUN: igc_opt --opaque-pointers -platformpvc -igc-emit-visa -regkey EnableDebugging -simd-mode 16 %s | FileCheck %s
+; ------------------------------------------------
+; EmitVISAPass
+; ------------------------------------------------
+
+;; Test read payload size for LSC2DBlockReadAddrPayload intrinsics
+;;
+
+define spir_kernel void @test(i64 %b, i32 %x, i32 %y, i32 %k) {
+entry:
+;
+; case 0:   u32_m7k1 transpose (32-bit elements, block width 1, height 7, 1 block)
+;
+; CHECK: Block2D: transpose block height not power of 2, zero padded.
+;
+  %ap = call i32* @llvm.genx.GenISA.LSC2DBlockCreateAddrPayload.p0i32(i64 %b, i32 0, i32 1023, i32 1023, i32 %x, i32 %y, i32 1, i32 7, i32 1)
+  %val = call i32 @llvm.genx.GenISA.LSC2DBlockReadAddrPayload.p0i32(i32* %ap, i32 0, i32 0, i32 32, i32 1, i32 7, i32 1, i1 true, i1 false, i32 0)
+  %tmp0 = add i64 %b, 1024
+  %addr0 = inttoptr i64 %tmp0 to i32 addrspace(1)*
+  store i32 %val, i32 addrspace(1)* %addr0, align 4
+
+; NOTE(review): CHECKs match in file order; the GRF-size CHECK below presumably matches case 0's output - confirm.
+;
+; case 1: u16_m15k2 vnni transform (16-bit elements, block width 2, height 15, 1 block); reads through its own payload %ap1
+;
+; CHECK: Block2D: block size not multiple of GRF size, zero padded
+; CHECK: Block2D: transform block height not multiple of N (32/eltBits), zero padded
+;
+  %ap1 = call i32* @llvm.genx.GenISA.LSC2DBlockCreateAddrPayload.p0i32(i64 %b, i32 0, i32 1023, i32 1023, i32 %x, i32 %y, i32 2, i32 15, i32 1)
+  %val1 = call <2 x i16> @llvm.genx.GenISA.LSC2DBlockReadAddrPayload.p0i32.v2i16(i32* %ap1, i32 0, i32 0, i32 16, i32 2, i32 15, i32 1, i1 false, i1 true, i32 0)
+  %tmp1 = add i64 %b, 2048
+  %val10 = bitcast <2 x i16> %val1 to i32
+  %addr1 = inttoptr i64 %tmp1 to i32 addrspace(1)*
+  store i32 %val10, i32 addrspace(1)* %addr1, align 4
+
+  ret void
+}
+
+declare i32* @llvm.genx.GenISA.LSC2DBlockCreateAddrPayload.p0i32(i64, i32, i32, i32, i32, i32, i32, i32, i32)
+declare <2 x i16> @llvm.genx.GenISA.LSC2DBlockReadAddrPayload.p0i32.v2i16(i32*, i32, i32, i32, i32, i32, i32, i1, i1, i32)
+declare i32 @llvm.genx.GenISA.LSC2DBlockReadAddrPayload.p0i32(i32*, i32, i32, i32, i32, i32, i32, i1, i1, i32)
+
+
+!IGCMetadata = !{!0}
+!igc.functions = !{!21}
+
+!0 = !{!"ModuleMD", !1}
+!1 = !{!"FuncMD", !2, !3}
+!2 = !{!"FuncMDMap[0]", void (i64, i32, i32, i32)* @test}
+!3 = !{!"FuncMDValue[0]", !4, !17}
+!4 = !{!"resAllocMD", !5}
+!5 = !{!"argAllocMDList", !6, !10, !11, !14, !15, !16}
+!6 = !{!"argAllocMDListVec[0]", !7, !8, !9}
+!7 = !{!"type", i32 0}
+!8 = !{!"extensionType", i32 -1}
+!9 = !{!"indexType", i32 -1}
+!10 = !{!"argAllocMDListVec[1]", !7, !8, !9}
+!11 = !{!"argAllocMDListVec[2]", !12, !8, !13}
+!12 = !{!"type", i32 1}
+!13 = !{!"indexType", i32 0}
+!14 = !{!"argAllocMDListVec[3]", !7, !8, !9}
+!15 = !{!"argAllocMDListVec[4]", !7, !8, !9}
+!16 = !{!"argAllocMDListVec[5]", !7, !8, !9}
+!17 = !{!"m_OpenCLArgTypeQualifiers", !18, !19, !20}
+!18 = !{!"m_OpenCLArgTypeQualifiersVec[0]", !""}
+!19 = !{!"m_OpenCLArgTypeQualifiersVec[1]", !""}
+!20 = !{!"m_OpenCLArgTypeQualifiersVec[2]", !""}
+!21 = !{void (i64, i32, i32, i32)* @test, !22}
+!22 = !{!23, !24}
+!23 = !{!"function_type", i32 0}
+!24 = !{!"sub_group_size", i32 16}
diff --git a/IGC/Compiler/tests/EmitVISAPass/block2d_return_size.ll b/IGC/Compiler/tests/EmitVISAPass/block2d_return_size.ll
@@ -0,0 +1,107 @@
+;=========================== begin_copyright_notice ============================
+;
+; Copyright (C) 2025 Intel Corporation
+;
+; SPDX-License-Identifier: MIT
+;
+;============================ end_copyright_notice =============================
+; This test runs vISA EmitPass and checks if 2d block read needs zero padding.
+; It checks warning messages
+
+; REQUIRES: llvm-14-plus, regkeys
+
+; RUN: igc_opt --opaque-pointers -platformpvc -igc-emit-visa %s -regkey EnableDebugging \
+; RUN:   -simd-mode 16 | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024-n8:16:32"
+target triple = "spir64-unknown-unknown"
+
+;
+; Function Attrs: convergent nounwind null_pointer_is_valid
+define spir_kernel void @test_2dblock_read_zero_padding(i16 addrspace(1)* align 2 %dst, i64 %base, i32 %widthm1, i32 %heightm1, i32 %pitchm1, i32 %x, i32 %y, <8 x i32> %r0, <8 x i32> %payloadHeader, <3 x i32> %enqueuedLocalSize, i16 %localIdX, i16 %localIdY, i16 %localIdZ, i32 %bufferOffset) #1 {
+entry:
+%ibase = ptrtoint i16 addrspace(1)* %dst to i64
+  %lid = zext i16 %localIdX to i64
+  %tmp = add i64  %ibase, %lid
+;
+; case 0 u32_m1k8v2: 32-bit elements, block width 8, height 1, 2 blocks (8*1*32 = 256 bits per block)
+; CHECK:  warning: Block2D: block size not multiple of GRF size, zero padded
+; Plain read: neither transpose nor vnni transform is requested.
+  %res0 = call <2 x i32> @llvm.genx.GenISA.LSC2DBlockRead.v2i32(i64 %base, i32 %widthm1, i32 %heightm1, i32 %pitchm1, i32 %x, i32 %y, i32 32, i32 8, i32 1, i32 2, i1 false, i1 false, i32 0)
+  %tmp0 = add i64 %tmp, 16
+  %addr0 = inttoptr i64 %tmp0 to <2 x i32> addrspace(1)*
+  store <2 x i32> %res0, <2 x i32> addrspace(1)* %addr0, align 8
+
+;
+; case 1 u32_m7k2 transpose: 32-bit elements, block width 2, height 7, 1 block
+; CHECK:  warning: Block2D: transpose block height not power of 2, zero padded
+; Transposed read with block height 7, which is not a power of 2.
+  %res1 = call i32 @llvm.genx.GenISA.LSC2DBlockRead.i32(i64 %base, i32 %widthm1, i32 %heightm1, i32 %pitchm1, i32 %x, i32 %y, i32 32, i32 2, i32 7, i32 1, i1 true, i1 false, i32 0)
+  %tmp1 = add i64 %tmp, 128
+  %addr1 = inttoptr i64 %tmp1 to i32 addrspace(1)*
+  store i32 %res1, i32 addrspace(1)* %addr1, align 8
+
+;
+; case 2 u8_m29k2 vnni transform: 8-bit elements, block width 2, height 29, 1 block
+; CHECK: warning: Block2D: transform block height not multiple of N (32/eltBits), zero padded.
+; With 8-bit elements N = 32/8 = 4, and height 29 is not a multiple of 4.
+  %res2 = call <2 x i16> @llvm.genx.GenISA.LSC2DBlockRead.v2i16(i64 %base, i32 %widthm1, i32 %heightm1, i32 %pitchm1, i32 %x, i32 %y, i32 8, i32 2, i32 29, i32 1, i1 false, i1 true, i32 0)
+  %tmp2 = add i64 %tmp, 512
+  %res20 = bitcast <2 x i16> %res2 to i32
+  %addr2 = inttoptr i64 %tmp2 to i32 addrspace(1)*
+  store i32 %res20, i32 addrspace(1)* %addr2, align 8
+
+  ret void
+}
+
+declare <2 x i16> @llvm.genx.GenISA.LSC2DBlockRead.v2i16(i64, i32, i32, i32, i32, i32, i32, i32, i32, i32, i1, i1, i32)
+declare <2 x i32> @llvm.genx.GenISA.LSC2DBlockRead.v2i32(i64, i32, i32, i32, i32, i32, i32, i32, i32, i32, i1, i1, i32)
+declare i32 @llvm.genx.GenISA.LSC2DBlockRead.i32(i64, i32, i32, i32, i32, i32, i32, i32, i32, i32, i1, i1, i32)
+
+!igc.functions = !{!3}
+!IGCMetadata = !{!16}
+
+!3 = !{void (i16 addrspace(1)*, i64, i32, i32, i32, i32, i32, <8 x i32>, <8 x i32>, <3 x i32>, i16, i16, i16, i32)* @test_2dblock_read_zero_padding, !4}
+!4 = !{!5, !6, !15}
+!5 = !{!"function_type", i32 0}
+!6 = !{!"implicit_arg_desc", !7, !8, !9, !10, !11, !12, !13}
+!7 = !{i32 0}
+!8 = !{i32 1}
+!9 = !{i32 7}
+!10 = !{i32 8}
+!11 = !{i32 9}
+!12 = !{i32 10}
+!13 = !{i32 15, !14}
+!14 = !{!"explicit_arg_num", i32 0}
+!15 = !{!"sub_group_size", i32 16}
+!16 = !{!"ModuleMD", !131}
+!131 = !{!"FuncMD", !132, !133}
+!132 = !{!"FuncMDMap[0]", void (i16 addrspace(1)*, i64, i32, i32, i32, i32, i32, <8 x i32>, <8 x i32>, <3 x i32>, i16, i16, i16, i32)* @test_2dblock_read_zero_padding}
+!133 = !{!"FuncMDValue[0]", !166, !237}
+!166 = !{!"resAllocMD", !170}
+!170 = !{!"argAllocMDList", !171, !175, !176, !177, !178, !179, !180, !181, !182, !183, !184, !185, !186, !187}
+!171 = !{!"argAllocMDListVec[0]", !172, !173, !174}
+!172 = !{!"type", i32 0}
+!173 = !{!"extensionType", i32 -1}
+!174 = !{!"indexType", i32 -1}
+!175 = !{!"argAllocMDListVec[1]", !172, !173, !174}
+!176 = !{!"argAllocMDListVec[2]", !172, !173, !174}
+!177 = !{!"argAllocMDListVec[3]", !172, !173, !174}
+!178 = !{!"argAllocMDListVec[4]", !172, !173, !174}
+!179 = !{!"argAllocMDListVec[5]", !172, !173, !174}
+!180 = !{!"argAllocMDListVec[6]", !172, !173, !174}
+!181 = !{!"argAllocMDListVec[7]", !172, !173, !174}
+!182 = !{!"argAllocMDListVec[8]", !172, !173, !174}
+!183 = !{!"argAllocMDListVec[9]", !172, !173, !174}
+!184 = !{!"argAllocMDListVec[10]", !172, !173, !174}
+!185 = !{!"argAllocMDListVec[11]", !172, !173, !174}
+!186 = !{!"argAllocMDListVec[12]", !172, !173, !174}
+!187 = !{!"argAllocMDListVec[13]", !172, !173, !174}
+!237 = !{!"m_OpenCLArgTypeQualifiers", !238, !239, !240, !241, !242, !243, !244}
+!238 = !{!"m_OpenCLArgTypeQualifiersVec[0]", !""}
+!239 = !{!"m_OpenCLArgTypeQualifiersVec[1]", !""}
+!240 = !{!"m_OpenCLArgTypeQualifiersVec[2]", !""}
+!241 = !{!"m_OpenCLArgTypeQualifiersVec[3]", !""}
+!242 = !{!"m_OpenCLArgTypeQualifiersVec[4]", !""}
+!243 = !{!"m_OpenCLArgTypeQualifiersVec[5]", !""}
+!244 = !{!"m_OpenCLArgTypeQualifiersVec[6]", !""}