Skip to content
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.

Commit 64eb82e

Browse files
BartoszDunajski and Compute-Runtime-Automation
authored and committed Jul 2, 2021
Add Kernel restrictions
Signed-off-by: Bartosz Dunajski <[email protected]>
1 parent aed3fad commit 64eb82e

File tree

8 files changed

+574
-17
lines changed

8 files changed

+574
-17
lines changed
 

‎opencl/source/kernel/kernel.cpp

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2670,4 +2670,87 @@ bool Kernel::areMultipleSubDevicesInContext() const {
26702670
return context ? context->containsMultipleSubDevices(clDevice.getRootDeviceIndex()) : false;
26712671
}
26722672

2673+
void Kernel::reconfigureKernel() {
2674+
auto &kernelDescriptor = kernelInfo.kernelDescriptor;
2675+
if (kernelDescriptor.kernelAttributes.numGrfRequired == GrfConfig::LargeGrfNumber) {
2676+
maxKernelWorkGroupSize >>= 1;
2677+
}
2678+
this->containsStatelessWrites = kernelDescriptor.kernelAttributes.flags.usesStatelessWrites;
2679+
this->specialPipelineSelectMode = kernelDescriptor.extendedInfo.get() ? kernelDescriptor.extendedInfo->specialPipelineSelectModeRequired() : false;
2680+
}
2681+
bool Kernel::requiresCacheFlushCommand(const CommandQueue &commandQueue) const {
2682+
if (false == HwHelper::cacheFlushAfterWalkerSupported(commandQueue.getDevice().getHardwareInfo())) {
2683+
return false;
2684+
}
2685+
2686+
if (DebugManager.flags.EnableCacheFlushAfterWalkerForAllQueues.get() != -1) {
2687+
return !!DebugManager.flags.EnableCacheFlushAfterWalkerForAllQueues.get();
2688+
}
2689+
2690+
bool cmdQueueRequiresCacheFlush = commandQueue.getRequiresCacheFlushAfterWalker();
2691+
if (false == cmdQueueRequiresCacheFlush) {
2692+
return false;
2693+
}
2694+
if (commandQueue.getGpgpuCommandStreamReceiver().isMultiOsContextCapable()) {
2695+
return false;
2696+
}
2697+
bool isMultiDevice = commandQueue.getContext().containsMultipleSubDevices(commandQueue.getDevice().getRootDeviceIndex());
2698+
if (false == isMultiDevice) {
2699+
return false;
2700+
}
2701+
bool isDefaultContext = (commandQueue.getContext().peekContextType() == ContextType::CONTEXT_TYPE_DEFAULT);
2702+
if (true == isDefaultContext) {
2703+
return false;
2704+
}
2705+
2706+
if (getProgram()->getGlobalSurface(commandQueue.getDevice().getRootDeviceIndex()) != nullptr) {
2707+
return true;
2708+
}
2709+
if (svmAllocationsRequireCacheFlush) {
2710+
return true;
2711+
}
2712+
size_t args = kernelArgRequiresCacheFlush.size();
2713+
for (size_t i = 0; i < args; i++) {
2714+
if (kernelArgRequiresCacheFlush[i] != nullptr) {
2715+
return true;
2716+
}
2717+
}
2718+
return false;
2719+
}
2720+
2721+
bool Kernel::requiresLimitedWorkgroupSize() const {
2722+
if (!this->isBuiltIn) {
2723+
return false;
2724+
}
2725+
if (this->auxTranslationDirection != AuxTranslationDirection::None) {
2726+
return false;
2727+
}
2728+
2729+
//if source is buffer in local memory, no need for limited workgroup
2730+
if (this->kernelInfo.getArgDescriptorAt(0).is<ArgDescriptor::ArgTPointer>()) {
2731+
if (this->getKernelArgInfo(0).object) {
2732+
auto rootDeviceIndex = getDevice().getRootDeviceIndex();
2733+
auto buffer = castToObject<Buffer>(this->getKernelArgInfo(0u).object);
2734+
if (buffer && buffer->getGraphicsAllocation(rootDeviceIndex)->getMemoryPool() == MemoryPool::LocalMemory) {
2735+
return false;
2736+
}
2737+
}
2738+
}
2739+
2740+
//if we are reading from image no need for limited workgroup
2741+
if (this->kernelInfo.getArgDescriptorAt(0).is<ArgDescriptor::ArgTImage>()) {
2742+
return false;
2743+
}
2744+
2745+
return true;
2746+
}
2747+
2748+
void Kernel::updateAuxTranslationRequired() {
2749+
if (DebugManager.flags.EnableStatelessCompression.get()) {
2750+
if (hasDirectStatelessAccessToHostMemory() || hasIndirectStatelessAccessToHostMemory()) {
2751+
setAuxTranslationRequired(true);
2752+
}
2753+
}
2754+
}
2755+
26732756
} // namespace NEO

‎opencl/source/kernel/kernel_extra.cpp

Lines changed: 1 addition & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -11,11 +11,7 @@
1111
#include "opencl/source/kernel/kernel.h"
1212

1313
namespace NEO {
14-
bool Kernel::requiresCacheFlushCommand(const CommandQueue &commandQueue) const {
15-
return false;
16-
}
17-
void Kernel::reconfigureKernel() {
18-
}
14+
1915
int Kernel::setKernelThreadArbitrationPolicy(uint32_t policy) {
2016
auto hwInfo = clDevice.getHardwareInfo();
2117
auto &hwHelper = NEO::ClHwHelper::get(hwInfo.platform.eRenderCoreFamily);
@@ -39,14 +35,8 @@ bool Kernel::requiresPerDssBackedBuffer() const {
3935
return DebugManager.flags.ForcePerDssBackedBufferProgramming.get();
4036
}
4137

42-
bool Kernel::requiresLimitedWorkgroupSize() const {
43-
return this->isBuiltIn;
44-
}
45-
4638
int32_t Kernel::setAdditionalKernelExecInfoWithParam(uint32_t paramName, size_t paramValueSize, const void *paramValue) {
4739
return CL_INVALID_VALUE;
4840
}
4941

50-
void Kernel::updateAuxTranslationRequired() {
51-
}
5242
} // namespace NEO

‎opencl/test/unit_test/kernel/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ set(IGDRCL_SRCS_tests_kernel
1616
${CMAKE_CURRENT_SOURCE_DIR}/kernel_arg_info_tests.cpp
1717
${CMAKE_CURRENT_SOURCE_DIR}/kernel_arg_pipe_tests.cpp
1818
${CMAKE_CURRENT_SOURCE_DIR}/kernel_arg_svm_tests.cpp
19-
${CMAKE_CURRENT_SOURCE_DIR}${BRANCH_DIR_SUFFIX}/kernel_cache_flush_requirements_tests.cpp
19+
${CMAKE_CURRENT_SOURCE_DIR}/kernel_cache_flush_requirements_tests.cpp
2020
${CMAKE_CURRENT_SOURCE_DIR}/kernel_info_cl_tests.cpp
2121
${CMAKE_CURRENT_SOURCE_DIR}/kernel_image_arg_tests.cpp
2222
${CMAKE_CURRENT_SOURCE_DIR}/kernel_immediate_arg_tests.cpp

‎opencl/test/unit_test/kernel/kernel_arg_buffer_tests.cpp

Lines changed: 144 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -461,6 +461,150 @@ TEST_F(KernelArgBufferTest, whenSettingAuxTranslationRequiredThenIsAuxTranslatio
461461
}
462462
}
463463

464+
TEST_F(KernelArgBufferTest, givenSetArgBufferOnKernelWithDirectStatelessAccessToHostMemoryWhenUpdateAuxTranslationRequiredIsCalledThenIsAuxTranslationRequiredShouldReturnTrue) {
465+
DebugManagerStateRestore debugRestorer;
466+
DebugManager.flags.EnableStatelessCompression.set(1);
467+
468+
MockBuffer buffer;
469+
buffer.getGraphicsAllocation(mockRootDeviceIndex)->setAllocationType(GraphicsAllocation::AllocationType::BUFFER_HOST_MEMORY);
470+
471+
auto val = (cl_mem)&buffer;
472+
auto pVal = &val;
473+
474+
auto retVal = pKernel->setArg(0, sizeof(cl_mem *), pVal);
475+
EXPECT_EQ(CL_SUCCESS, retVal);
476+
477+
EXPECT_TRUE(pKernel->hasDirectStatelessAccessToHostMemory());
478+
479+
EXPECT_FALSE(pKernel->isAuxTranslationRequired());
480+
481+
pKernel->updateAuxTranslationRequired();
482+
483+
EXPECT_TRUE(pKernel->isAuxTranslationRequired());
484+
}
485+
486+
TEST_F(KernelArgBufferTest, givenSetArgBufferOnKernelWithNoDirectStatelessAccessToHostMemoryWhenUpdateAuxTranslationRequiredIsCalledThenIsAuxTranslationRequiredShouldReturnFalse) {
487+
DebugManagerStateRestore debugRestorer;
488+
DebugManager.flags.EnableStatelessCompression.set(1);
489+
490+
MockBuffer buffer;
491+
buffer.getGraphicsAllocation(mockRootDeviceIndex)->setAllocationType(GraphicsAllocation::AllocationType::BUFFER_COMPRESSED);
492+
493+
auto val = (cl_mem)&buffer;
494+
auto pVal = &val;
495+
496+
auto retVal = pKernel->setArg(0, sizeof(cl_mem *), pVal);
497+
EXPECT_EQ(CL_SUCCESS, retVal);
498+
499+
EXPECT_FALSE(pKernel->hasDirectStatelessAccessToHostMemory());
500+
501+
EXPECT_FALSE(pKernel->isAuxTranslationRequired());
502+
503+
pKernel->updateAuxTranslationRequired();
504+
505+
EXPECT_FALSE(pKernel->isAuxTranslationRequired());
506+
}
507+
508+
TEST_F(KernelArgBufferTest, givenSetArgSvmAllocOnKernelWithDirectStatelessAccessToHostMemoryWhenUpdateAuxTranslationRequiredIsCalledThenIsAuxTranslationRequiredShouldReturnTrue) {
509+
DebugManagerStateRestore debugRestorer;
510+
DebugManager.flags.EnableStatelessCompression.set(1);
511+
512+
char data[128];
513+
void *ptr = &data;
514+
MockGraphicsAllocation gfxAllocation(ptr, 128);
515+
gfxAllocation.setAllocationType(GraphicsAllocation::AllocationType::BUFFER_HOST_MEMORY);
516+
517+
auto retVal = pKernel->setArgSvmAlloc(0, ptr, &gfxAllocation);
518+
EXPECT_EQ(CL_SUCCESS, retVal);
519+
520+
EXPECT_TRUE(pKernel->hasDirectStatelessAccessToHostMemory());
521+
522+
EXPECT_FALSE(pKernel->isAuxTranslationRequired());
523+
524+
pKernel->updateAuxTranslationRequired();
525+
526+
EXPECT_TRUE(pKernel->isAuxTranslationRequired());
527+
}
528+
529+
TEST_F(KernelArgBufferTest, givenSetArgSvmAllocOnKernelWithNoDirectStatelessAccessToHostMemoryWhenUpdateAuxTranslationRequiredIsCalledThenIsAuxTranslationRequiredShouldReturnFalse) {
530+
DebugManagerStateRestore debugRestorer;
531+
DebugManager.flags.EnableStatelessCompression.set(1);
532+
533+
char data[128];
534+
void *ptr = &data;
535+
MockGraphicsAllocation gfxAllocation(ptr, 128);
536+
gfxAllocation.setAllocationType(GraphicsAllocation::AllocationType::BUFFER_COMPRESSED);
537+
538+
auto retVal = pKernel->setArgSvmAlloc(0, ptr, &gfxAllocation);
539+
EXPECT_EQ(CL_SUCCESS, retVal);
540+
541+
EXPECT_FALSE(pKernel->hasDirectStatelessAccessToHostMemory());
542+
543+
EXPECT_FALSE(pKernel->isAuxTranslationRequired());
544+
545+
pKernel->updateAuxTranslationRequired();
546+
547+
EXPECT_FALSE(pKernel->isAuxTranslationRequired());
548+
}
549+
550+
TEST_F(KernelArgBufferTest, givenSetUnifiedMemoryExecInfoOnKernelWithNoIndirectStatelessAccessWhenUpdateAuxTranslationRequiredIsCalledThenIsAuxTranslationRequiredShouldReturnFalse) {
551+
DebugManagerStateRestore debugRestorer;
552+
DebugManager.flags.EnableStatelessCompression.set(1);
553+
554+
pKernelInfo->hasIndirectStatelessAccess = false;
555+
556+
MockGraphicsAllocation gfxAllocation;
557+
gfxAllocation.setAllocationType(GraphicsAllocation::AllocationType::BUFFER_HOST_MEMORY);
558+
559+
pKernel->setUnifiedMemoryExecInfo(&gfxAllocation);
560+
561+
EXPECT_FALSE(pKernel->hasIndirectStatelessAccessToHostMemory());
562+
563+
EXPECT_FALSE(pKernel->isAuxTranslationRequired());
564+
565+
pKernel->updateAuxTranslationRequired();
566+
567+
EXPECT_FALSE(pKernel->isAuxTranslationRequired());
568+
}
569+
570+
TEST_F(KernelArgBufferTest, givenSetUnifiedMemoryExecInfoOnKernelWithIndirectStatelessAccessWhenUpdateAuxTranslationRequiredIsCalledThenIsAuxTranslationRequiredShouldReturnTrueForHostMemoryAllocation) {
571+
DebugManagerStateRestore debugRestorer;
572+
DebugManager.flags.EnableStatelessCompression.set(1);
573+
574+
pKernelInfo->hasIndirectStatelessAccess = true;
575+
576+
const auto allocationTypes = {GraphicsAllocation::AllocationType::BUFFER,
577+
GraphicsAllocation::AllocationType::BUFFER_COMPRESSED,
578+
GraphicsAllocation::AllocationType::BUFFER_HOST_MEMORY};
579+
580+
MockGraphicsAllocation gfxAllocation;
581+
582+
for (const auto type : allocationTypes) {
583+
gfxAllocation.setAllocationType(type);
584+
585+
pKernel->setUnifiedMemoryExecInfo(&gfxAllocation);
586+
587+
if (type == GraphicsAllocation::AllocationType::BUFFER_HOST_MEMORY) {
588+
EXPECT_TRUE(pKernel->hasIndirectStatelessAccessToHostMemory());
589+
} else {
590+
EXPECT_FALSE(pKernel->hasIndirectStatelessAccessToHostMemory());
591+
}
592+
593+
EXPECT_FALSE(pKernel->isAuxTranslationRequired());
594+
595+
pKernel->updateAuxTranslationRequired();
596+
597+
if (type == GraphicsAllocation::AllocationType::BUFFER_HOST_MEMORY) {
598+
EXPECT_TRUE(pKernel->isAuxTranslationRequired());
599+
} else {
600+
EXPECT_FALSE(pKernel->isAuxTranslationRequired());
601+
}
602+
603+
pKernel->clearUnifiedMemoryExecInfo();
604+
pKernel->setAuxTranslationRequired(false);
605+
}
606+
}
607+
464608
class KernelArgBufferFixtureBindless : public KernelArgBufferFixture {
465609
public:
466610
void SetUp() {

‎opencl/test/unit_test/kernel/kernel_cache_flush_requirements_tests.cpp

Lines changed: 303 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -6,23 +6,321 @@
66
*/
77

88
#include "shared/test/common/helpers/debug_manager_state_restore.h"
9+
#include "shared/test/common/helpers/variable_backup.h"
910
#include "shared/test/common/mocks/mock_graphics_allocation.h"
1011

1112
#include "opencl/test/unit_test/fixtures/cl_device_fixture.h"
1213
#include "opencl/test/unit_test/fixtures/context_fixture.h"
14+
#include "opencl/test/unit_test/fixtures/platform_fixture.h"
1315
#include "opencl/test/unit_test/mocks/mock_command_queue.h"
1416
#include "opencl/test/unit_test/mocks/mock_context.h"
1517
#include "opencl/test/unit_test/mocks/mock_kernel.h"
1618
#include "opencl/test/unit_test/mocks/mock_program.h"
19+
#include "test.h"
1720

18-
using namespace NEO;
21+
namespace NEO {
1922

20-
TEST(KernelWithCacheFlushTests, givenDeviceWhichDoesntRequireCacheFlushWhenCheckIfKernelRequireFlushThenReturnedFalse) {
21-
auto device = std::make_unique<MockClDevice>(MockDevice::createWithNewExecutionEnvironment<MockDevice>(defaultHwInfo.get()));
23+
class KernelWithCacheFlushTests : public PlatformFixture, public testing::TestWithParam<std::tuple<const char *, const char *>> {
24+
public:
25+
void SetUp() override {
26+
}
27+
void TearDown() override {
28+
}
29+
void initializePlatform() {
30+
PlatformFixture::SetUp();
31+
}
32+
void clearPlatform() {
33+
PlatformFixture::TearDown();
34+
}
35+
};
36+
TEST_F(KernelWithCacheFlushTests, givenDeviceWhichDoesntRequireCacheFlushWhenCheckIfKernelRequireFlushThenReturnedFalse) {
37+
initializePlatform();
38+
auto device = pPlatform->getClDevice(0);
2239

2340
auto mockKernel = std::make_unique<MockKernelWithInternals>(*device);
24-
MockContext mockContext(device.get());
25-
MockCommandQueue queue;
41+
MockContext mockContext(device);
42+
MockCommandQueue queue(mockContext);
2643
bool flushRequired = mockKernel->mockKernel->Kernel::requiresCacheFlushCommand(queue);
2744
EXPECT_FALSE(flushRequired);
45+
clearPlatform();
2846
}
47+
TEST_F(KernelWithCacheFlushTests, givenQueueWhichDoesntRequireCacheFlushWhenCheckIfKernelRequireFlushThenReturnedFalse) {
48+
initializePlatform();
49+
DebugManagerStateRestore dbgRestore;
50+
DebugManager.flags.EnableCacheFlushAfterWalker.set(1);
51+
auto device = pPlatform->getClDevice(0);
52+
53+
auto mockKernel = std::make_unique<MockKernelWithInternals>(*device);
54+
MockContext mockContext(device);
55+
MockCommandQueue queue(mockContext);
56+
57+
bool flushRequired = mockKernel->mockKernel->Kernel::requiresCacheFlushCommand(queue);
58+
EXPECT_FALSE(flushRequired);
59+
clearPlatform();
60+
}
61+
TEST_F(KernelWithCacheFlushTests, givenCacheFlushForAllQueuesDisabledWhenCheckIfKernelRequireFlushThenReturnedFalse) {
62+
initializePlatform();
63+
DebugManagerStateRestore dbgRestore;
64+
DebugManager.flags.EnableCacheFlushAfterWalker.set(1);
65+
DebugManager.flags.EnableCacheFlushAfterWalkerForAllQueues.set(0);
66+
auto device = pPlatform->getClDevice(0);
67+
68+
auto mockKernel = std::make_unique<MockKernelWithInternals>(*device);
69+
MockContext mockContext(device);
70+
MockCommandQueue queue(mockContext);
71+
bool flushRequired = mockKernel->mockKernel->Kernel::requiresCacheFlushCommand(queue);
72+
73+
EXPECT_FALSE(flushRequired);
74+
clearPlatform();
75+
}
76+
HWTEST_F(KernelWithCacheFlushTests, givenCacheFlushForMultiEngineEnabledWhenCheckIfKernelRequireFlushThenReturnedFalse) {
77+
initializePlatform();
78+
DebugManagerStateRestore dbgRestore;
79+
DebugManager.flags.EnableCacheFlushAfterWalker.set(1);
80+
auto device = pPlatform->getClDevice(0);
81+
82+
auto mockKernel = std::make_unique<MockKernelWithInternals>(*device);
83+
MockContext mockContext(device);
84+
auto cmdQ = std::make_unique<MockCommandQueueHw<FamilyType>>(&mockContext, device, nullptr);
85+
cmdQ->requiresCacheFlushAfterWalker = true;
86+
auto &ultCsr = static_cast<UltCommandStreamReceiver<FamilyType> &>(cmdQ->getGpgpuCommandStreamReceiver());
87+
ultCsr.multiOsContextCapable = true;
88+
bool flushRequired = mockKernel->mockKernel->Kernel::requiresCacheFlushCommand(*cmdQ.get());
89+
90+
EXPECT_FALSE(flushRequired);
91+
clearPlatform();
92+
}
93+
94+
HWTEST_F(KernelWithCacheFlushTests, givenCacheFlushForSingleDeviceProgramWhenCheckIfKernelRequireFlushThenReturnedFalse) {
95+
DebugManagerStateRestore dbgRestore;
96+
DebugManager.flags.CreateMultipleSubDevices.set(1);
97+
initializePlatform();
98+
DebugManager.flags.EnableCacheFlushAfterWalker.set(1);
99+
auto device = pPlatform->getClDevice(0);
100+
101+
auto mockKernel = std::make_unique<MockKernelWithInternals>(*device);
102+
MockContext mockContext(device);
103+
auto cmdQ = std::make_unique<MockCommandQueueHw<FamilyType>>(&mockContext, device, nullptr);
104+
auto &ultCsr = static_cast<UltCommandStreamReceiver<FamilyType> &>(cmdQ->getGpgpuCommandStreamReceiver());
105+
ultCsr.multiOsContextCapable = false;
106+
cmdQ->requiresCacheFlushAfterWalker = true;
107+
bool flushRequired = mockKernel->mockKernel->Kernel::requiresCacheFlushCommand(*cmdQ.get());
108+
109+
EXPECT_FALSE(flushRequired);
110+
clearPlatform();
111+
}
112+
113+
HWTEST_F(KernelWithCacheFlushTests, givenCacheFlushForDefaultTypeContextWhenCheckIfKernelRequireFlushThenReturnedFalse) {
114+
DebugManagerStateRestore dbgRestore;
115+
DebugManager.flags.EnableCacheFlushAfterWalker.set(1);
116+
uint32_t numDevices = 2;
117+
DebugManager.flags.CreateMultipleSubDevices.set(numDevices);
118+
initializePlatform();
119+
auto device = pPlatform->getClDevice(0);
120+
121+
auto mockKernel = std::make_unique<MockKernelWithInternals>(*device);
122+
MockContext mockContext(device);
123+
auto cmdQ = std::make_unique<MockCommandQueueHw<FamilyType>>(&mockContext, device, nullptr);
124+
cmdQ->requiresCacheFlushAfterWalker = true;
125+
auto &ultCsr = static_cast<UltCommandStreamReceiver<FamilyType> &>(cmdQ->getGpgpuCommandStreamReceiver());
126+
ultCsr.multiOsContextCapable = false;
127+
bool flushRequired = mockKernel->mockKernel->Kernel::requiresCacheFlushCommand(*cmdQ.get());
128+
129+
EXPECT_FALSE(flushRequired);
130+
clearPlatform();
131+
}
132+
HWTEST_F(KernelWithCacheFlushTests, givenCacheFlushWithNullGlobalSurfaceWhenCheckIfKernelRequireFlushThenReturnedFalse) {
133+
DebugManagerStateRestore dbgRestore;
134+
DebugManager.flags.EnableCacheFlushAfterWalker.set(1);
135+
uint32_t numDevices = 2;
136+
DebugManager.flags.CreateMultipleSubDevices.set(numDevices);
137+
initializePlatform();
138+
auto device = pPlatform->getClDevice(0);
139+
140+
auto mockKernel = std::make_unique<MockKernelWithInternals>(*device);
141+
MockContext mockContext(device);
142+
mockContext.contextType = ContextType::CONTEXT_TYPE_SPECIALIZED;
143+
auto cmdQ = std::make_unique<MockCommandQueueHw<FamilyType>>(&mockContext, device, nullptr);
144+
cmdQ->requiresCacheFlushAfterWalker = true;
145+
auto &ultCsr = static_cast<UltCommandStreamReceiver<FamilyType> &>(cmdQ->getGpgpuCommandStreamReceiver());
146+
ultCsr.multiOsContextCapable = false;
147+
bool flushRequired = mockKernel->mockKernel->Kernel::requiresCacheFlushCommand(*cmdQ.get());
148+
149+
EXPECT_FALSE(flushRequired);
150+
clearPlatform();
151+
}
152+
HWTEST_F(KernelWithCacheFlushTests, givenCacheFlushWithGlobalSurfaceWhenCheckIfKernelRequireFlushThenReturnedTrue) {
153+
DebugManagerStateRestore dbgRestore;
154+
DebugManager.flags.EnableCacheFlushAfterWalker.set(1);
155+
uint32_t numDevices = 2;
156+
DebugManager.flags.CreateMultipleSubDevices.set(numDevices);
157+
initializePlatform();
158+
auto device = pPlatform->getClDevice(0);
159+
160+
auto mockKernel = std::make_unique<MockKernelWithInternals>(*device);
161+
MockContext mockContext(device);
162+
mockContext.contextType = ContextType::CONTEXT_TYPE_SPECIALIZED;
163+
164+
void *allocPtr = reinterpret_cast<void *>(static_cast<uintptr_t>(6 * MemoryConstants::pageSize));
165+
MockGraphicsAllocation globalAllocation{allocPtr, MemoryConstants::pageSize * 2};
166+
mockKernel->mockProgram->setGlobalSurface(&globalAllocation);
167+
168+
auto cmdQ = std::make_unique<MockCommandQueueHw<FamilyType>>(&mockContext, device, nullptr);
169+
cmdQ->requiresCacheFlushAfterWalker = true;
170+
auto &ultCsr = static_cast<UltCommandStreamReceiver<FamilyType> &>(cmdQ->getGpgpuCommandStreamReceiver());
171+
ultCsr.multiOsContextCapable = false;
172+
bool flushRequired = mockKernel->mockKernel->Kernel::requiresCacheFlushCommand(*cmdQ.get());
173+
174+
EXPECT_TRUE(flushRequired);
175+
mockKernel->mockProgram->setGlobalSurface(nullptr);
176+
clearPlatform();
177+
}
178+
179+
HWTEST2_F(KernelWithCacheFlushTests, givenCacheFlushRequiredWhenEstimatingThenAddRequiredCommands, IsAtLeastXeHpCore) {
180+
DebugManagerStateRestore dbgRestore;
181+
DebugManager.flags.CreateMultipleSubDevices.set(2);
182+
183+
initializePlatform();
184+
185+
if (!pPlatform->getClDevice(0)->getHardwareInfo().capabilityTable.supportCacheFlushAfterWalker) {
186+
clearPlatform();
187+
GTEST_SKIP();
188+
}
189+
190+
auto device = pPlatform->getClDevice(0);
191+
192+
auto mockKernel = std::make_unique<MockKernelWithInternals>(*device);
193+
MockContext mockContext(device);
194+
mockContext.contextType = ContextType::CONTEXT_TYPE_SPECIALIZED;
195+
auto cmdQ = std::make_unique<MockCommandQueueHw<FamilyType>>(&mockContext, device, nullptr);
196+
197+
CsrDependencies csrDeps;
198+
DispatchInfo dispatchInfo;
199+
MultiDispatchInfo multiDispatchInfo(mockKernel->mockKernel);
200+
dispatchInfo.setKernel(mockKernel->mockKernel);
201+
dispatchInfo.setNumberOfWorkgroups({1, 1, 1});
202+
dispatchInfo.setTotalNumberOfWorkgroups({1, 1, 1});
203+
multiDispatchInfo.push(dispatchInfo);
204+
205+
size_t initialSize = 0;
206+
size_t sizeWithCacheFlush = 0;
207+
size_t expectedDiff = sizeof(typename FamilyType::PIPE_CONTROL);
208+
if constexpr (FamilyType::isUsingL3Control) {
209+
expectedDiff += sizeof(typename FamilyType::L3_CONTROL) + sizeof(typename FamilyType::L3_FLUSH_ADDRESS_RANGE);
210+
}
211+
212+
{
213+
EXPECT_FALSE(mockKernel->mockKernel->Kernel::requiresCacheFlushCommand(*cmdQ));
214+
215+
initialSize = EnqueueOperation<FamilyType>::getTotalSizeRequiredCS(CL_COMMAND_NDRANGE_KERNEL, csrDeps, false, false, false, *cmdQ, multiDispatchInfo, false);
216+
}
217+
218+
{
219+
DebugManager.flags.EnableCacheFlushAfterWalker.set(1);
220+
void *allocPtr = reinterpret_cast<void *>(static_cast<uintptr_t>(6 * MemoryConstants::pageSize));
221+
MockGraphicsAllocation globalAllocation{allocPtr, MemoryConstants::pageSize * 2};
222+
mockKernel->mockProgram->setGlobalSurface(&globalAllocation);
223+
224+
cmdQ->requiresCacheFlushAfterWalker = true;
225+
auto &ultCsr = static_cast<UltCommandStreamReceiver<FamilyType> &>(cmdQ->getGpgpuCommandStreamReceiver());
226+
ultCsr.multiOsContextCapable = false;
227+
EXPECT_TRUE(mockKernel->mockKernel->Kernel::requiresCacheFlushCommand(*cmdQ));
228+
229+
sizeWithCacheFlush = EnqueueOperation<FamilyType>::getTotalSizeRequiredCS(CL_COMMAND_NDRANGE_KERNEL, csrDeps, false, false, false, *cmdQ, multiDispatchInfo, false);
230+
}
231+
232+
EXPECT_EQ(initialSize + expectedDiff, sizeWithCacheFlush);
233+
234+
mockKernel->mockProgram->setGlobalSurface(nullptr);
235+
clearPlatform();
236+
}
237+
238+
HWTEST_F(KernelWithCacheFlushTests, givenCacheFlushWithAllocationsRequireCacheFlushFlagOnWhenCheckIfKernelRequireFlushThenReturnedTrue) {
239+
DebugManagerStateRestore dbgRestore;
240+
DebugManager.flags.EnableCacheFlushAfterWalker.set(1);
241+
uint32_t numDevices = 2;
242+
DebugManager.flags.CreateMultipleSubDevices.set(numDevices);
243+
initializePlatform();
244+
auto device = pPlatform->getClDevice(0);
245+
246+
auto mockKernel = std::make_unique<MockKernelWithInternals>(*device);
247+
MockContext mockContext(device);
248+
mockContext.contextType = ContextType::CONTEXT_TYPE_SPECIALIZED;
249+
auto cmdQ = std::make_unique<MockCommandQueueHw<FamilyType>>(&mockContext, device, nullptr);
250+
cmdQ->requiresCacheFlushAfterWalker = true;
251+
auto &ultCsr = static_cast<UltCommandStreamReceiver<FamilyType> &>(cmdQ->getGpgpuCommandStreamReceiver());
252+
ultCsr.multiOsContextCapable = false;
253+
mockKernel->mockKernel->svmAllocationsRequireCacheFlush = true;
254+
bool flushRequired = mockKernel->mockKernel->Kernel::requiresCacheFlushCommand(*cmdQ.get());
255+
256+
EXPECT_TRUE(flushRequired);
257+
clearPlatform();
258+
}
259+
HWTEST_F(KernelWithCacheFlushTests, givenCacheFlushWithAllocationsWhichRequireCacheFlushWhenCheckIfKernelRequireFlushThenReturnedTrue) {
260+
DebugManagerStateRestore dbgRestore;
261+
DebugManager.flags.EnableCacheFlushAfterWalker.set(1);
262+
uint32_t numDevices = 2;
263+
DebugManager.flags.CreateMultipleSubDevices.set(numDevices);
264+
initializePlatform();
265+
auto device = pPlatform->getClDevice(0);
266+
267+
auto mockKernel = std::make_unique<MockKernelWithInternals>(*device);
268+
MockContext mockContext(device);
269+
mockContext.contextType = ContextType::CONTEXT_TYPE_SPECIALIZED;
270+
auto cmdQ = std::make_unique<MockCommandQueueHw<FamilyType>>(&mockContext, device, nullptr);
271+
cmdQ->requiresCacheFlushAfterWalker = true;
272+
auto &ultCsr = static_cast<UltCommandStreamReceiver<FamilyType> &>(cmdQ->getGpgpuCommandStreamReceiver());
273+
ultCsr.multiOsContextCapable = false;
274+
mockKernel->mockKernel->svmAllocationsRequireCacheFlush = false;
275+
mockKernel->mockKernel->kernelArgRequiresCacheFlush.resize(2);
276+
MockGraphicsAllocation cacheRequiringAllocation;
277+
mockKernel->mockKernel->kernelArgRequiresCacheFlush[1] = &cacheRequiringAllocation;
278+
bool flushRequired = mockKernel->mockKernel->Kernel::requiresCacheFlushCommand(*cmdQ.get());
279+
280+
EXPECT_TRUE(flushRequired);
281+
clearPlatform();
282+
}
283+
284+
HWTEST_F(KernelWithCacheFlushTests,
285+
givenEnableCacheFlushAfterWalkerForAllQueuesFlagSetWhenCheckIfKernelRequierFlushThenTrueIsAlwaysReturned) {
286+
DebugManagerStateRestore dbgRestore;
287+
DebugManager.flags.EnableCacheFlushAfterWalker.set(1);
288+
DebugManager.flags.EnableCacheFlushAfterWalkerForAllQueues.set(1);
289+
MockGraphicsAllocation cacheRequiringAllocation;
290+
291+
for (auto isMultiEngine : ::testing::Bool()) {
292+
for (auto isMultiDevice : ::testing::Bool()) {
293+
for (auto isDefaultContext : ::testing::Bool()) {
294+
for (auto svmAllocationRequiresCacheFlush : ::testing::Bool()) {
295+
for (auto kernelArgRequiresCacheFlush : ::testing::Bool()) {
296+
auto deviceCount = (isMultiDevice ? 2 : 0);
297+
auto contextType =
298+
(isDefaultContext ? ContextType::CONTEXT_TYPE_DEFAULT : ContextType::CONTEXT_TYPE_SPECIALIZED);
299+
GraphicsAllocation *kernelArg = (kernelArgRequiresCacheFlush ? &cacheRequiringAllocation : nullptr);
300+
301+
DebugManager.flags.CreateMultipleSubDevices.set(deviceCount);
302+
initializePlatform();
303+
304+
auto device = pPlatform->getClDevice(0);
305+
MockContext mockContext(device);
306+
mockContext.contextType = contextType;
307+
auto cmdQ = std::make_unique<MockCommandQueueHw<FamilyType>>(&mockContext, device, nullptr);
308+
cmdQ->requiresCacheFlushAfterWalker = true;
309+
auto &ultCsr = static_cast<UltCommandStreamReceiver<FamilyType> &>(cmdQ->getGpgpuCommandStreamReceiver());
310+
ultCsr.multiOsContextCapable = isMultiEngine;
311+
312+
auto mockKernel = std::make_unique<MockKernelWithInternals>(*device);
313+
mockKernel->mockKernel->svmAllocationsRequireCacheFlush = svmAllocationRequiresCacheFlush;
314+
mockKernel->mockKernel->kernelArgRequiresCacheFlush.resize(1);
315+
mockKernel->mockKernel->kernelArgRequiresCacheFlush[0] = kernelArg;
316+
317+
auto flushRequired = mockKernel->mockKernel->Kernel::requiresCacheFlushCommand(*cmdQ.get());
318+
EXPECT_TRUE(flushRequired);
319+
clearPlatform();
320+
}
321+
}
322+
}
323+
}
324+
}
325+
}
326+
} // namespace NEO

‎opencl/test/unit_test/kernel/kernel_tests.cpp

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3171,3 +3171,43 @@ TEST_F(KernelTests, givenKernelWithSimdEqual1WhenKernelCreatedThenMaxWorgGroupSi
31713171
EXPECT_LT(pKernel->getMaxKernelWorkGroupSize(), deviceMaxWorkGroupSize);
31723172
EXPECT_EQ(pKernel->getMaxKernelWorkGroupSize(), maxThreadsPerWG);
31733173
}
3174+
3175+
struct KernelLargeGrfTests : Test<ClDeviceFixture> {
3176+
void SetUp() override {
3177+
ClDeviceFixture::SetUp();
3178+
program = std::make_unique<MockProgram>(toClDeviceVector(*pClDevice));
3179+
pKernelInfo = std::make_unique<KernelInfo>();
3180+
pKernelInfo->kernelDescriptor.kernelAttributes.crossThreadDataSize = 64;
3181+
}
3182+
3183+
void TearDown() override {
3184+
ClDeviceFixture::TearDown();
3185+
}
3186+
3187+
std::unique_ptr<MockProgram> program;
3188+
std::unique_ptr<KernelInfo> pKernelInfo;
3189+
SPatchExecutionEnvironment executionEnvironment = {};
3190+
};
3191+
3192+
HWTEST_F(KernelLargeGrfTests, GivenLargeGrfWhenGettingMaxWorkGroupSizeThenCorrectValueReturned) {
3193+
pKernelInfo->kernelDescriptor.kernelAttributes.simdSize = 32;
3194+
pKernelInfo->kernelDescriptor.kernelAttributes.crossThreadDataSize = 4;
3195+
pKernelInfo->kernelDescriptor.payloadMappings.implicitArgs.maxWorkGroupSize = 0;
3196+
{
3197+
MockKernel kernel(program.get(), *pKernelInfo, *pClDevice);
3198+
3199+
pKernelInfo->kernelDescriptor.kernelAttributes.numGrfRequired = GrfConfig::LargeGrfNumber - 1;
3200+
EXPECT_EQ(CL_SUCCESS, kernel.initialize());
3201+
EXPECT_EQ(pDevice->getDeviceInfo().maxWorkGroupSize, *kernel.maxWorkGroupSizeForCrossThreadData);
3202+
EXPECT_EQ(pDevice->getDeviceInfo().maxWorkGroupSize, kernel.maxKernelWorkGroupSize);
3203+
}
3204+
3205+
{
3206+
MockKernel kernel(program.get(), *pKernelInfo, *pClDevice);
3207+
3208+
pKernelInfo->kernelDescriptor.kernelAttributes.numGrfRequired = GrfConfig::LargeGrfNumber;
3209+
EXPECT_EQ(CL_SUCCESS, kernel.initialize());
3210+
EXPECT_EQ(pDevice->getDeviceInfo().maxWorkGroupSize >> 1, *kernel.maxWorkGroupSizeForCrossThreadData);
3211+
EXPECT_EQ(pDevice->getDeviceInfo().maxWorkGroupSize >> 1, kernel.maxKernelWorkGroupSize);
3212+
}
3213+
}

‎opencl/test/unit_test/test_files/igdrcl.config

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -299,5 +299,6 @@ EnableUserFenceUseCtxId = -1
299299
EnableResourceTags = 0
300300
SetKmdWaitTimeout = -1
301301
OverrideNotifyEnableForTagUpdatePostSync = -1
302+
EnableCacheFlushAfterWalkerForAllQueues = -1
302303
Force32BitDriverSupport = -1
303304
OverrideCmdQueueSynchronousMode = -1

‎shared/source/debug_settings/debug_variables_base.inl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -206,6 +206,7 @@ DECLARE_DEBUG_VARIABLE(int32_t, MinHwThreadsUnoccupied, 0, "If not zero then max
206206
DECLARE_DEBUG_VARIABLE(int32_t, PerformImplicitFlushEveryEnqueueCount, -1, "If greater than 0, driver performs implicit flush every N submissions.")
207207
DECLARE_DEBUG_VARIABLE(int32_t, PerformImplicitFlushForNewResource, -1, "-1: platform specific, 0: force disable, 1: force enable")
208208
DECLARE_DEBUG_VARIABLE(int32_t, PerformImplicitFlushForIdleGpu, -1, "-1: platform specific, 0: force disable, 1: force enable")
209+
DECLARE_DEBUG_VARIABLE(int32_t, EnableCacheFlushAfterWalkerForAllQueues, -1, "Enable cache flush after walker even if queue doesn't require it")
209210

210211
/*DIRECT SUBMISSION FLAGS*/
211212
DECLARE_DEBUG_VARIABLE(int32_t, EnableDirectSubmission, -1, "-1: default (disabled), 0: disable, 1:enable. Enables direct submission of command buffers bypassing KMD")

0 commit comments

Comments (0)
Please sign in to comment.