Skip to content
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.

Commit 64eb82e

Browse files
BartoszDunajski and Compute-Runtime-Automation
authored and committed Jul 2, 2021
Add Kernel restrictions
Signed-off-by: Bartosz Dunajski <[email protected]>
1 parent aed3fad commit 64eb82e

File tree

8 files changed

+574
-17
lines changed

8 files changed

+574
-17
lines changed
 

‎opencl/source/kernel/kernel.cpp

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2670,4 +2670,87 @@ bool Kernel::areMultipleSubDevicesInContext() const {
26702670
return context ? context->containsMultipleSubDevices(clDevice.getRootDeviceIndex()) : false;
26712671
}
26722672

2673+
void Kernel::reconfigureKernel() {
2674+
auto &kernelDescriptor = kernelInfo.kernelDescriptor;
2675+
if (kernelDescriptor.kernelAttributes.numGrfRequired == GrfConfig::LargeGrfNumber) {
2676+
maxKernelWorkGroupSize >>= 1;
2677+
}
2678+
this->containsStatelessWrites = kernelDescriptor.kernelAttributes.flags.usesStatelessWrites;
2679+
this->specialPipelineSelectMode = kernelDescriptor.extendedInfo.get() ? kernelDescriptor.extendedInfo->specialPipelineSelectModeRequired() : false;
2680+
}
2681+
bool Kernel::requiresCacheFlushCommand(const CommandQueue &commandQueue) const {
2682+
if (false == HwHelper::cacheFlushAfterWalkerSupported(commandQueue.getDevice().getHardwareInfo())) {
2683+
return false;
2684+
}
2685+
2686+
if (DebugManager.flags.EnableCacheFlushAfterWalkerForAllQueues.get() != -1) {
2687+
return !!DebugManager.flags.EnableCacheFlushAfterWalkerForAllQueues.get();
2688+
}
2689+
2690+
bool cmdQueueRequiresCacheFlush = commandQueue.getRequiresCacheFlushAfterWalker();
2691+
if (false == cmdQueueRequiresCacheFlush) {
2692+
return false;
2693+
}
2694+
if (commandQueue.getGpgpuCommandStreamReceiver().isMultiOsContextCapable()) {
2695+
return false;
2696+
}
2697+
bool isMultiDevice = commandQueue.getContext().containsMultipleSubDevices(commandQueue.getDevice().getRootDeviceIndex());
2698+
if (false == isMultiDevice) {
2699+
return false;
2700+
}
2701+
bool isDefaultContext = (commandQueue.getContext().peekContextType() == ContextType::CONTEXT_TYPE_DEFAULT);
2702+
if (true == isDefaultContext) {
2703+
return false;
2704+
}
2705+
2706+
if (getProgram()->getGlobalSurface(commandQueue.getDevice().getRootDeviceIndex()) != nullptr) {
2707+
return true;
2708+
}
2709+
if (svmAllocationsRequireCacheFlush) {
2710+
return true;
2711+
}
2712+
size_t args = kernelArgRequiresCacheFlush.size();
2713+
for (size_t i = 0; i < args; i++) {
2714+
if (kernelArgRequiresCacheFlush[i] != nullptr) {
2715+
return true;
2716+
}
2717+
}
2718+
return false;
2719+
}
2720+
2721+
bool Kernel::requiresLimitedWorkgroupSize() const {
2722+
if (!this->isBuiltIn) {
2723+
return false;
2724+
}
2725+
if (this->auxTranslationDirection != AuxTranslationDirection::None) {
2726+
return false;
2727+
}
2728+
2729+
//if source is buffer in local memory, no need for limited workgroup
2730+
if (this->kernelInfo.getArgDescriptorAt(0).is<ArgDescriptor::ArgTPointer>()) {
2731+
if (this->getKernelArgInfo(0).object) {
2732+
auto rootDeviceIndex = getDevice().getRootDeviceIndex();
2733+
auto buffer = castToObject<Buffer>(this->getKernelArgInfo(0u).object);
2734+
if (buffer && buffer->getGraphicsAllocation(rootDeviceIndex)->getMemoryPool() == MemoryPool::LocalMemory) {
2735+
return false;
2736+
}
2737+
}
2738+
}
2739+
2740+
//if we are reading from image no need for limited workgroup
2741+
if (this->kernelInfo.getArgDescriptorAt(0).is<ArgDescriptor::ArgTImage>()) {
2742+
return false;
2743+
}
2744+
2745+
return true;
2746+
}
2747+
2748+
void Kernel::updateAuxTranslationRequired() {
2749+
if (DebugManager.flags.EnableStatelessCompression.get()) {
2750+
if (hasDirectStatelessAccessToHostMemory() || hasIndirectStatelessAccessToHostMemory()) {
2751+
setAuxTranslationRequired(true);
2752+
}
2753+
}
2754+
}
2755+
26732756
} // namespace NEO

‎opencl/source/kernel/kernel_extra.cpp

Lines changed: 1 addition & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -11,11 +11,7 @@
1111
#include "opencl/source/kernel/kernel.h"
1212

1313
namespace NEO {
14-
bool Kernel::requiresCacheFlushCommand(const CommandQueue &commandQueue) const {
15-
return false;
16-
}
17-
void Kernel::reconfigureKernel() {
18-
}
14+
1915
int Kernel::setKernelThreadArbitrationPolicy(uint32_t policy) {
2016
auto hwInfo = clDevice.getHardwareInfo();
2117
auto &hwHelper = NEO::ClHwHelper::get(hwInfo.platform.eRenderCoreFamily);
@@ -39,14 +35,8 @@ bool Kernel::requiresPerDssBackedBuffer() const {
3935
return DebugManager.flags.ForcePerDssBackedBufferProgramming.get();
4036
}
4137

42-
bool Kernel::requiresLimitedWorkgroupSize() const {
43-
return this->isBuiltIn;
44-
}
45-
4638
int32_t Kernel::setAdditionalKernelExecInfoWithParam(uint32_t paramName, size_t paramValueSize, const void *paramValue) {
4739
return CL_INVALID_VALUE;
4840
}
4941

50-
void Kernel::updateAuxTranslationRequired() {
51-
}
5242
} // namespace NEO

‎opencl/test/unit_test/kernel/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ set(IGDRCL_SRCS_tests_kernel
1616
${CMAKE_CURRENT_SOURCE_DIR}/kernel_arg_info_tests.cpp
1717
${CMAKE_CURRENT_SOURCE_DIR}/kernel_arg_pipe_tests.cpp
1818
${CMAKE_CURRENT_SOURCE_DIR}/kernel_arg_svm_tests.cpp
19-
${CMAKE_CURRENT_SOURCE_DIR}${BRANCH_DIR_SUFFIX}/kernel_cache_flush_requirements_tests.cpp
19+
${CMAKE_CURRENT_SOURCE_DIR}/kernel_cache_flush_requirements_tests.cpp
2020
${CMAKE_CURRENT_SOURCE_DIR}/kernel_info_cl_tests.cpp
2121
${CMAKE_CURRENT_SOURCE_DIR}/kernel_image_arg_tests.cpp
2222
${CMAKE_CURRENT_SOURCE_DIR}/kernel_immediate_arg_tests.cpp

‎opencl/test/unit_test/kernel/kernel_arg_buffer_tests.cpp

Lines changed: 144 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -461,6 +461,150 @@ TEST_F(KernelArgBufferTest, whenSettingAuxTranslationRequiredThenIsAuxTranslatio
461461
}
462462
}
463463

464+
TEST_F(KernelArgBufferTest, givenSetArgBufferOnKernelWithDirectStatelessAccessToHostMemoryWhenUpdateAuxTranslationRequiredIsCalledThenIsAuxTranslationRequiredShouldReturnTrue) {
465+
DebugManagerStateRestore debugRestorer;
466+
DebugManager.flags.EnableStatelessCompression.set(1);
467+
468+
MockBuffer buffer;
469+
buffer.getGraphicsAllocation(mockRootDeviceIndex)->setAllocationType(GraphicsAllocation::AllocationType::BUFFER_HOST_MEMORY);
470+
471+
auto val = (cl_mem)&buffer;
472+
auto pVal = &val;
473+
474+
auto retVal = pKernel->setArg(0, sizeof(cl_mem *), pVal);
475+
EXPECT_EQ(CL_SUCCESS, retVal);
476+
477+
EXPECT_TRUE(pKernel->hasDirectStatelessAccessToHostMemory());
478+
479+
EXPECT_FALSE(pKernel->isAuxTranslationRequired());
480+
481+
pKernel->updateAuxTranslationRequired();
482+
483+
EXPECT_TRUE(pKernel->isAuxTranslationRequired());
484+
}
485+
486+
TEST_F(KernelArgBufferTest, givenSetArgBufferOnKernelWithNoDirectStatelessAccessToHostMemoryWhenUpdateAuxTranslationRequiredIsCalledThenIsAuxTranslationRequiredShouldReturnFalse) {
487+
DebugManagerStateRestore debugRestorer;
488+
DebugManager.flags.EnableStatelessCompression.set(1);
489+
490+
MockBuffer buffer;
491+
buffer.getGraphicsAllocation(mockRootDeviceIndex)->setAllocationType(GraphicsAllocation::AllocationType::BUFFER_COMPRESSED);
492+
493+
auto val = (cl_mem)&buffer;
494+
auto pVal = &val;
495+
496+
auto retVal = pKernel->setArg(0, sizeof(cl_mem *), pVal);
497+
EXPECT_EQ(CL_SUCCESS, retVal);
498+
499+
EXPECT_FALSE(pKernel->hasDirectStatelessAccessToHostMemory());
500+
501+
EXPECT_FALSE(pKernel->isAuxTranslationRequired());
502+
503+
pKernel->updateAuxTranslationRequired();
504+
505+
EXPECT_FALSE(pKernel->isAuxTranslationRequired());
506+
}
507+
508+
TEST_F(KernelArgBufferTest, givenSetArgSvmAllocOnKernelWithDirectStatelessAccessToHostMemoryWhenUpdateAuxTranslationRequiredIsCalledThenIsAuxTranslationRequiredShouldReturnTrue) {
509+
DebugManagerStateRestore debugRestorer;
510+
DebugManager.flags.EnableStatelessCompression.set(1);
511+
512+
char data[128];
513+
void *ptr = &data;
514+
MockGraphicsAllocation gfxAllocation(ptr, 128);
515+
gfxAllocation.setAllocationType(GraphicsAllocation::AllocationType::BUFFER_HOST_MEMORY);
516+
517+
auto retVal = pKernel->setArgSvmAlloc(0, ptr, &gfxAllocation);
518+
EXPECT_EQ(CL_SUCCESS, retVal);
519+
520+
EXPECT_TRUE(pKernel->hasDirectStatelessAccessToHostMemory());
521+
522+
EXPECT_FALSE(pKernel->isAuxTranslationRequired());
523+
524+
pKernel->updateAuxTranslationRequired();
525+
526+
EXPECT_TRUE(pKernel->isAuxTranslationRequired());
527+
}
528+
529+
TEST_F(KernelArgBufferTest, givenSetArgSvmAllocOnKernelWithNoDirectStatelessAccessToHostMemoryWhenUpdateAuxTranslationRequiredIsCalledThenIsAuxTranslationRequiredShouldReturnFalse) {
530+
DebugManagerStateRestore debugRestorer;
531+
DebugManager.flags.EnableStatelessCompression.set(1);
532+
533+
char data[128];
534+
void *ptr = &data;
535+
MockGraphicsAllocation gfxAllocation(ptr, 128);
536+
gfxAllocation.setAllocationType(GraphicsAllocation::AllocationType::BUFFER_COMPRESSED);
537+
538+
auto retVal = pKernel->setArgSvmAlloc(0, ptr, &gfxAllocation);
539+
EXPECT_EQ(CL_SUCCESS, retVal);
540+
541+
EXPECT_FALSE(pKernel->hasDirectStatelessAccessToHostMemory());
542+
543+
EXPECT_FALSE(pKernel->isAuxTranslationRequired());
544+
545+
pKernel->updateAuxTranslationRequired();
546+
547+
EXPECT_FALSE(pKernel->isAuxTranslationRequired());
548+
}
549+
550+
TEST_F(KernelArgBufferTest, givenSetUnifiedMemoryExecInfoOnKernelWithNoIndirectStatelessAccessWhenUpdateAuxTranslationRequiredIsCalledThenIsAuxTranslationRequiredShouldReturnFalse) {
551+
DebugManagerStateRestore debugRestorer;
552+
DebugManager.flags.EnableStatelessCompression.set(1);
553+
554+
pKernelInfo->hasIndirectStatelessAccess = false;
555+
556+
MockGraphicsAllocation gfxAllocation;
557+
gfxAllocation.setAllocationType(GraphicsAllocation::AllocationType::BUFFER_HOST_MEMORY);
558+
559+
pKernel->setUnifiedMemoryExecInfo(&gfxAllocation);
560+
561+
EXPECT_FALSE(pKernel->hasIndirectStatelessAccessToHostMemory());
562+
563+
EXPECT_FALSE(pKernel->isAuxTranslationRequired());
564+
565+
pKernel->updateAuxTranslationRequired();
566+
567+
EXPECT_FALSE(pKernel->isAuxTranslationRequired());
568+
}
569+
570+
TEST_F(KernelArgBufferTest, givenSetUnifiedMemoryExecInfoOnKernelWithIndirectStatelessAccessWhenUpdateAuxTranslationRequiredIsCalledThenIsAuxTranslationRequiredShouldReturnTrueForHostMemoryAllocation) {
571+
DebugManagerStateRestore debugRestorer;
572+
DebugManager.flags.EnableStatelessCompression.set(1);
573+
574+
pKernelInfo->hasIndirectStatelessAccess = true;
575+
576+
const auto allocationTypes = {GraphicsAllocation::AllocationType::BUFFER,
577+
GraphicsAllocation::AllocationType::BUFFER_COMPRESSED,
578+
GraphicsAllocation::AllocationType::BUFFER_HOST_MEMORY};
579+
580+
MockGraphicsAllocation gfxAllocation;
581+
582+
for (const auto type : allocationTypes) {
583+
gfxAllocation.setAllocationType(type);
584+
585+
pKernel->setUnifiedMemoryExecInfo(&gfxAllocation);
586+
587+
if (type == GraphicsAllocation::AllocationType::BUFFER_HOST_MEMORY) {
588+
EXPECT_TRUE(pKernel->hasIndirectStatelessAccessToHostMemory());
589+
} else {
590+
EXPECT_FALSE(pKernel->hasIndirectStatelessAccessToHostMemory());
591+
}
592+
593+
EXPECT_FALSE(pKernel->isAuxTranslationRequired());
594+
595+
pKernel->updateAuxTranslationRequired();
596+
597+
if (type == GraphicsAllocation::AllocationType::BUFFER_HOST_MEMORY) {
598+
EXPECT_TRUE(pKernel->isAuxTranslationRequired());
599+
} else {
600+
EXPECT_FALSE(pKernel->isAuxTranslationRequired());
601+
}
602+
603+
pKernel->clearUnifiedMemoryExecInfo();
604+
pKernel->setAuxTranslationRequired(false);
605+
}
606+
}
607+
464608
class KernelArgBufferFixtureBindless : public KernelArgBufferFixture {
465609
public:
466610
void SetUp() {

‎opencl/test/unit_test/kernel/kernel_cache_flush_requirements_tests.cpp

Lines changed: 303 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -6,23 +6,321 @@
66
*/
77

88
#include "shared/test/common/helpers/debug_manager_state_restore.h"
9+
#include "shared/test/common/helpers/variable_backup.h"
910
#include "shared/test/common/mocks/mock_graphics_allocation.h"
1011

1112
#include "opencl/test/unit_test/fixtures/cl_device_fixture.h"
1213
#include "opencl/test/unit_test/fixtures/context_fixture.h"
14+
#include "opencl/test/unit_test/fixtures/platform_fixture.h"
1315
#include "opencl/test/unit_test/mocks/mock_command_queue.h"
1416
#include "opencl/test/unit_test/mocks/mock_context.h"
1517
#include "opencl/test/unit_test/mocks/mock_kernel.h"
1618
#include "opencl/test/unit_test/mocks/mock_program.h"
19+
#include "test.h"
1720

18-
using namespace NEO;
21+
namespace NEO {
1922

20-
TEST(KernelWithCacheFlushTests, givenDeviceWhichDoesntRequireCacheFlushWhenCheckIfKernelRequireFlushThenReturnedFalse) {
21-
auto device = std::make_unique<MockClDevice>(MockDevice::createWithNewExecutionEnvironment<MockDevice>(defaultHwInfo.get()));
23+
class KernelWithCacheFlushTests : public PlatformFixture, public testing::TestWithParam<std::tuple<const char *, const char *>> {
24+
public:
25+
void SetUp() override {
26+
}
27+
void TearDown() override {
28+
}
29+
void initializePlatform() {
30+
PlatformFixture::SetUp();
31+
}
32+
void clearPlatform() {
33+
PlatformFixture::TearDown();
34+
}
35+
};
36+
TEST_F(KernelWithCacheFlushTests, givenDeviceWhichDoesntRequireCacheFlushWhenCheckIfKernelRequireFlushThenReturnedFalse) {
37+
initializePlatform();
38+
auto device = pPlatform->getClDevice(0);
2239

2340
auto mockKernel = std::make_unique<MockKernelWithInternals>(*device);
24-
MockContext mockContext(device.get());
25-
MockCommandQueue queue;
41+
MockContext mockContext(device);
42+
MockCommandQueue queue(mockContext);
2643
bool flushRequired = mockKernel->mockKernel->Kernel::requiresCacheFlushCommand(queue);
2744
EXPECT_FALSE(flushRequired);
45+
clearPlatform();
2846
}
47+
TEST_F(KernelWithCacheFlushTests, givenQueueWhichDoesntRequireCacheFlushWhenCheckIfKernelRequireFlushThenReturnedFalse) {
48+
initializePlatform();
49+
DebugManagerStateRestore dbgRestore;
50+
DebugManager.flags.EnableCacheFlushAfterWalker.set(1);
51+
auto device = pPlatform->getClDevice(0);
52+
53+
auto mockKernel = std::make_unique<MockKernelWithInternals>(*device);
54+
MockContext mockContext(device);
55+
MockCommandQueue queue(mockContext);
56+
57+
bool flushRequired = mockKernel->mockKernel->Kernel::requiresCacheFlushCommand(queue);
58+
EXPECT_FALSE(flushRequired);
59+
clearPlatform();
60+
}
61+
TEST_F(KernelWithCacheFlushTests, givenCacheFlushForAllQueuesDisabledWhenCheckIfKernelRequireFlushThenReturnedFalse) {
62+
initializePlatform();
63+
DebugManagerStateRestore dbgRestore;
64+
DebugManager.flags.EnableCacheFlushAfterWalker.set(1);
65+
DebugManager.flags.EnableCacheFlushAfterWalkerForAllQueues.set(0);
66+
auto device = pPlatform->getClDevice(0);
67+
68+
auto mockKernel = std::make_unique<MockKernelWithInternals>(*device);
69+
MockContext mockContext(device);
70+
MockCommandQueue queue(mockContext);
71+
bool flushRequired = mockKernel->mockKernel->Kernel::requiresCacheFlushCommand(queue);
72+
73+
EXPECT_FALSE(flushRequired);
74+
clearPlatform();
75+
}
76+
HWTEST_F(KernelWithCacheFlushTests, givenCacheFlushForMultiEngineEnabledWhenCheckIfKernelRequireFlushThenReturnedFalse) {
77+
initializePlatform();
78+
DebugManagerStateRestore dbgRestore;
79+
DebugManager.flags.EnableCacheFlushAfterWalker.set(1);
80+
auto device = pPlatform->getClDevice(0);
81+
82+
auto mockKernel = std::make_unique<MockKernelWithInternals>(*device);
83+
MockContext mockContext(device);
84+
auto cmdQ = std::make_unique<MockCommandQueueHw<FamilyType>>(&mockContext, device, nullptr);
85+
cmdQ->requiresCacheFlushAfterWalker = true;
86+
auto &ultCsr = static_cast<UltCommandStreamReceiver<FamilyType> &>(cmdQ->getGpgpuCommandStreamReceiver());
87+
ultCsr.multiOsContextCapable = true;
88+
bool flushRequired = mockKernel->mockKernel->Kernel::requiresCacheFlushCommand(*cmdQ.get());
89+
90+
EXPECT_FALSE(flushRequired);
91+
clearPlatform();
92+
}
93+
94+
HWTEST_F(KernelWithCacheFlushTests, givenCacheFlushForSingleDeviceProgramWhenCheckIfKernelRequireFlushThenReturnedFalse) {
95+
DebugManagerStateRestore dbgRestore;
96+
DebugManager.flags.CreateMultipleSubDevices.set(1);
97+
initializePlatform();
98+
DebugManager.flags.EnableCacheFlushAfterWalker.set(1);
99+
auto device = pPlatform->getClDevice(0);
100+
101+
auto mockKernel = std::make_unique<MockKernelWithInternals>(*device);
102+
MockContext mockContext(device);
103+
auto cmdQ = std::make_unique<MockCommandQueueHw<FamilyType>>(&mockContext, device, nullptr);
104+
auto &ultCsr = static_cast<UltCommandStreamReceiver<FamilyType> &>(cmdQ->getGpgpuCommandStreamReceiver());
105+
ultCsr.multiOsContextCapable = false;
106+
cmdQ->requiresCacheFlushAfterWalker = true;
107+
bool flushRequired = mockKernel->mockKernel->Kernel::requiresCacheFlushCommand(*cmdQ.get());
108+
109+
EXPECT_FALSE(flushRequired);
110+
clearPlatform();
111+
}
112+
113+
HWTEST_F(KernelWithCacheFlushTests, givenCacheFlushForDefaultTypeContextWhenCheckIfKernelRequireFlushThenReturnedFalse) {
114+
DebugManagerStateRestore dbgRestore;
115+
DebugManager.flags.EnableCacheFlushAfterWalker.set(1);
116+
uint32_t numDevices = 2;
117+
DebugManager.flags.CreateMultipleSubDevices.set(numDevices);
118+
initializePlatform();
119+
auto device = pPlatform->getClDevice(0);
120+
121+
auto mockKernel = std::make_unique<MockKernelWithInternals>(*device);
122+
MockContext mockContext(device);
123+
auto cmdQ = std::make_unique<MockCommandQueueHw<FamilyType>>(&mockContext, device, nullptr);
124+
cmdQ->requiresCacheFlushAfterWalker = true;
125+
auto &ultCsr = static_cast<UltCommandStreamReceiver<FamilyType> &>(cmdQ->getGpgpuCommandStreamReceiver());
126+
ultCsr.multiOsContextCapable = false;
127+
bool flushRequired = mockKernel->mockKernel->Kernel::requiresCacheFlushCommand(*cmdQ.get());
128+
129+
EXPECT_FALSE(flushRequired);
130+
clearPlatform();
131+
}
132+
HWTEST_F(KernelWithCacheFlushTests, givenCacheFlushWithNullGlobalSurfaceWhenCheckIfKernelRequireFlushThenReturnedFalse) {
133+
DebugManagerStateRestore dbgRestore;
134+
DebugManager.flags.EnableCacheFlushAfterWalker.set(1);
135+
uint32_t numDevices = 2;
136+
DebugManager.flags.CreateMultipleSubDevices.set(numDevices);
137+
initializePlatform();
138+
auto device = pPlatform->getClDevice(0);
139+
140+
auto mockKernel = std::make_unique<MockKernelWithInternals>(*device);
141+
MockContext mockContext(device);
142+
mockContext.contextType = ContextType::CONTEXT_TYPE_SPECIALIZED;
143+
auto cmdQ = std::make_unique<MockCommandQueueHw<FamilyType>>(&mockContext, device, nullptr);
144+
cmdQ->requiresCacheFlushAfterWalker = true;
145+
auto &ultCsr = static_cast<UltCommandStreamReceiver<FamilyType> &>(cmdQ->getGpgpuCommandStreamReceiver());
146+
ultCsr.multiOsContextCapable = false;
147+
bool flushRequired = mockKernel->mockKernel->Kernel::requiresCacheFlushCommand(*cmdQ.get());
148+
149+
EXPECT_FALSE(flushRequired);
150+
clearPlatform();
151+
}
152+
HWTEST_F(KernelWithCacheFlushTests, givenCacheFlushWithGlobalSurfaceWhenCheckIfKernelRequireFlushThenReturnedTrue) {
153+
DebugManagerStateRestore dbgRestore;
154+
DebugManager.flags.EnableCacheFlushAfterWalker.set(1);
155+
uint32_t numDevices = 2;
156+
DebugManager.flags.CreateMultipleSubDevices.set(numDevices);
157+
initializePlatform();
158+
auto device = pPlatform->getClDevice(0);
159+
160+
auto mockKernel = std::make_unique<MockKernelWithInternals>(*device);
161+
MockContext mockContext(device);
162+
mockContext.contextType = ContextType::CONTEXT_TYPE_SPECIALIZED;
163+
164+
void *allocPtr = reinterpret_cast<void *>(static_cast<uintptr_t>(6 * MemoryConstants::pageSize));
165+
MockGraphicsAllocation globalAllocation{allocPtr, MemoryConstants::pageSize * 2};
166+
mockKernel->mockProgram->setGlobalSurface(&globalAllocation);
167+
168+
auto cmdQ = std::make_unique<MockCommandQueueHw<FamilyType>>(&mockContext, device, nullptr);
169+
cmdQ->requiresCacheFlushAfterWalker = true;
170+
auto &ultCsr = static_cast<UltCommandStreamReceiver<FamilyType> &>(cmdQ->getGpgpuCommandStreamReceiver());
171+
ultCsr.multiOsContextCapable = false;
172+
bool flushRequired = mockKernel->mockKernel->Kernel::requiresCacheFlushCommand(*cmdQ.get());
173+
174+
EXPECT_TRUE(flushRequired);
175+
mockKernel->mockProgram->setGlobalSurface(nullptr);
176+
clearPlatform();
177+
}
178+
179+
HWTEST2_F(KernelWithCacheFlushTests, givenCacheFlushRequiredWhenEstimatingThenAddRequiredCommands, IsAtLeastXeHpCore) {
180+
DebugManagerStateRestore dbgRestore;
181+
DebugManager.flags.CreateMultipleSubDevices.set(2);
182+
183+
initializePlatform();
184+
185+
if (!pPlatform->getClDevice(0)->getHardwareInfo().capabilityTable.supportCacheFlushAfterWalker) {
186+
clearPlatform();
187+
GTEST_SKIP();
188+
}
189+
190+
auto device = pPlatform->getClDevice(0);
191+
192+
auto mockKernel = std::make_unique<MockKernelWithInternals>(*device);
193+
MockContext mockContext(device);
194+
mockContext.contextType = ContextType::CONTEXT_TYPE_SPECIALIZED;
195+
auto cmdQ = std::make_unique<MockCommandQueueHw<FamilyType>>(&mockContext, device, nullptr);
196+
197+
CsrDependencies csrDeps;
198+
DispatchInfo dispatchInfo;
199+
MultiDispatchInfo multiDispatchInfo(mockKernel->mockKernel);
200+
dispatchInfo.setKernel(mockKernel->mockKernel);
201+
dispatchInfo.setNumberOfWorkgroups({1, 1, 1});
202+
dispatchInfo.setTotalNumberOfWorkgroups({1, 1, 1});
203+
multiDispatchInfo.push(dispatchInfo);
204+
205+
size_t initialSize = 0;
206+
size_t sizeWithCacheFlush = 0;
207+
size_t expectedDiff = sizeof(typename FamilyType::PIPE_CONTROL);
208+
if constexpr (FamilyType::isUsingL3Control) {
209+
expectedDiff += sizeof(typename FamilyType::L3_CONTROL) + sizeof(typename FamilyType::L3_FLUSH_ADDRESS_RANGE);
210+
}
211+
212+
{
213+
EXPECT_FALSE(mockKernel->mockKernel->Kernel::requiresCacheFlushCommand(*cmdQ));
214+
215+
initialSize = EnqueueOperation<FamilyType>::getTotalSizeRequiredCS(CL_COMMAND_NDRANGE_KERNEL, csrDeps, false, false, false, *cmdQ, multiDispatchInfo, false);
216+
}
217+
218+
{
219+
DebugManager.flags.EnableCacheFlushAfterWalker.set(1);
220+
void *allocPtr = reinterpret_cast<void *>(static_cast<uintptr_t>(6 * MemoryConstants::pageSize));
221+
MockGraphicsAllocation globalAllocation{allocPtr, MemoryConstants::pageSize * 2};
222+
mockKernel->mockProgram->setGlobalSurface(&globalAllocation);
223+
224+
cmdQ->requiresCacheFlushAfterWalker = true;
225+
auto &ultCsr = static_cast<UltCommandStreamReceiver<FamilyType> &>(cmdQ->getGpgpuCommandStreamReceiver());
226+
ultCsr.multiOsContextCapable = false;
227+
EXPECT_TRUE(mockKernel->mockKernel->Kernel::requiresCacheFlushCommand(*cmdQ));
228+
229+
sizeWithCacheFlush = EnqueueOperation<FamilyType>::getTotalSizeRequiredCS(CL_COMMAND_NDRANGE_KERNEL, csrDeps, false, false, false, *cmdQ, multiDispatchInfo, false);
230+
}
231+
232+
EXPECT_EQ(initialSize + expectedDiff, sizeWithCacheFlush);
233+
234+
mockKernel->mockProgram->setGlobalSurface(nullptr);
235+
clearPlatform();
236+
}
237+
238+
HWTEST_F(KernelWithCacheFlushTests, givenCacheFlushWithAllocationsRequireCacheFlushFlagOnWhenCheckIfKernelRequireFlushThenReturnedTrue) {
239+
DebugManagerStateRestore dbgRestore;
240+
DebugManager.flags.EnableCacheFlushAfterWalker.set(1);
241+
uint32_t numDevices = 2;
242+
DebugManager.flags.CreateMultipleSubDevices.set(numDevices);
243+
initializePlatform();
244+
auto device = pPlatform->getClDevice(0);
245+
246+
auto mockKernel = std::make_unique<MockKernelWithInternals>(*device);
247+
MockContext mockContext(device);
248+
mockContext.contextType = ContextType::CONTEXT_TYPE_SPECIALIZED;
249+
auto cmdQ = std::make_unique<MockCommandQueueHw<FamilyType>>(&mockContext, device, nullptr);
250+
cmdQ->requiresCacheFlushAfterWalker = true;
251+
auto &ultCsr = static_cast<UltCommandStreamReceiver<FamilyType> &>(cmdQ->getGpgpuCommandStreamReceiver());
252+
ultCsr.multiOsContextCapable = false;
253+
mockKernel->mockKernel->svmAllocationsRequireCacheFlush = true;
254+
bool flushRequired = mockKernel->mockKernel->Kernel::requiresCacheFlushCommand(*cmdQ.get());
255+
256+
EXPECT_TRUE(flushRequired);
257+
clearPlatform();
258+
}
259+
HWTEST_F(KernelWithCacheFlushTests, givenCacheFlushWithAllocationsWhichRequireCacheFlushWhenCheckIfKernelRequireFlushThenReturnedTrue) {
260+
DebugManagerStateRestore dbgRestore;
261+
DebugManager.flags.EnableCacheFlushAfterWalker.set(1);
262+
uint32_t numDevices = 2;
263+
DebugManager.flags.CreateMultipleSubDevices.set(numDevices);
264+
initializePlatform();
265+
auto device = pPlatform->getClDevice(0);
266+
267+
auto mockKernel = std::make_unique<MockKernelWithInternals>(*device);
268+
MockContext mockContext(device);
269+
mockContext.contextType = ContextType::CONTEXT_TYPE_SPECIALIZED;
270+
auto cmdQ = std::make_unique<MockCommandQueueHw<FamilyType>>(&mockContext, device, nullptr);
271+
cmdQ->requiresCacheFlushAfterWalker = true;
272+
auto &ultCsr = static_cast<UltCommandStreamReceiver<FamilyType> &>(cmdQ->getGpgpuCommandStreamReceiver());
273+
ultCsr.multiOsContextCapable = false;
274+
mockKernel->mockKernel->svmAllocationsRequireCacheFlush = false;
275+
mockKernel->mockKernel->kernelArgRequiresCacheFlush.resize(2);
276+
MockGraphicsAllocation cacheRequiringAllocation;
277+
mockKernel->mockKernel->kernelArgRequiresCacheFlush[1] = &cacheRequiringAllocation;
278+
bool flushRequired = mockKernel->mockKernel->Kernel::requiresCacheFlushCommand(*cmdQ.get());
279+
280+
EXPECT_TRUE(flushRequired);
281+
clearPlatform();
282+
}
283+
284+
HWTEST_F(KernelWithCacheFlushTests,
285+
givenEnableCacheFlushAfterWalkerForAllQueuesFlagSetWhenCheckIfKernelRequierFlushThenTrueIsAlwaysReturned) {
286+
DebugManagerStateRestore dbgRestore;
287+
DebugManager.flags.EnableCacheFlushAfterWalker.set(1);
288+
DebugManager.flags.EnableCacheFlushAfterWalkerForAllQueues.set(1);
289+
MockGraphicsAllocation cacheRequiringAllocation;
290+
291+
for (auto isMultiEngine : ::testing::Bool()) {
292+
for (auto isMultiDevice : ::testing::Bool()) {
293+
for (auto isDefaultContext : ::testing::Bool()) {
294+
for (auto svmAllocationRequiresCacheFlush : ::testing::Bool()) {
295+
for (auto kernelArgRequiresCacheFlush : ::testing::Bool()) {
296+
auto deviceCount = (isMultiDevice ? 2 : 0);
297+
auto contextType =
298+
(isDefaultContext ? ContextType::CONTEXT_TYPE_DEFAULT : ContextType::CONTEXT_TYPE_SPECIALIZED);
299+
GraphicsAllocation *kernelArg = (kernelArgRequiresCacheFlush ? &cacheRequiringAllocation : nullptr);
300+
301+
DebugManager.flags.CreateMultipleSubDevices.set(deviceCount);
302+
initializePlatform();
303+
304+
auto device = pPlatform->getClDevice(0);
305+
MockContext mockContext(device);
306+
mockContext.contextType = contextType;
307+
auto cmdQ = std::make_unique<MockCommandQueueHw<FamilyType>>(&mockContext, device, nullptr);
308+
cmdQ->requiresCacheFlushAfterWalker = true;
309+
auto &ultCsr = static_cast<UltCommandStreamReceiver<FamilyType> &>(cmdQ->getGpgpuCommandStreamReceiver());
310+
ultCsr.multiOsContextCapable = isMultiEngine;
311+
312+
auto mockKernel = std::make_unique<MockKernelWithInternals>(*device);
313+
mockKernel->mockKernel->svmAllocationsRequireCacheFlush = svmAllocationRequiresCacheFlush;
314+
mockKernel->mockKernel->kernelArgRequiresCacheFlush.resize(1);
315+
mockKernel->mockKernel->kernelArgRequiresCacheFlush[0] = kernelArg;
316+
317+
auto flushRequired = mockKernel->mockKernel->Kernel::requiresCacheFlushCommand(*cmdQ.get());
318+
EXPECT_TRUE(flushRequired);
319+
clearPlatform();
320+
}
321+
}
322+
}
323+
}
324+
}
325+
}
326+
} // namespace NEO

‎opencl/test/unit_test/kernel/kernel_tests.cpp

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3171,3 +3171,43 @@ TEST_F(KernelTests, givenKernelWithSimdEqual1WhenKernelCreatedThenMaxWorgGroupSi
31713171
EXPECT_LT(pKernel->getMaxKernelWorkGroupSize(), deviceMaxWorkGroupSize);
31723172
EXPECT_EQ(pKernel->getMaxKernelWorkGroupSize(), maxThreadsPerWG);
31733173
}
3174+
3175+
struct KernelLargeGrfTests : Test<ClDeviceFixture> {
3176+
void SetUp() override {
3177+
ClDeviceFixture::SetUp();
3178+
program = std::make_unique<MockProgram>(toClDeviceVector(*pClDevice));
3179+
pKernelInfo = std::make_unique<KernelInfo>();
3180+
pKernelInfo->kernelDescriptor.kernelAttributes.crossThreadDataSize = 64;
3181+
}
3182+
3183+
void TearDown() override {
3184+
ClDeviceFixture::TearDown();
3185+
}
3186+
3187+
std::unique_ptr<MockProgram> program;
3188+
std::unique_ptr<KernelInfo> pKernelInfo;
3189+
SPatchExecutionEnvironment executionEnvironment = {};
3190+
};
3191+
3192+
HWTEST_F(KernelLargeGrfTests, GivenLargeGrfWhenGettingMaxWorkGroupSizeThenCorrectValueReturned) {
3193+
pKernelInfo->kernelDescriptor.kernelAttributes.simdSize = 32;
3194+
pKernelInfo->kernelDescriptor.kernelAttributes.crossThreadDataSize = 4;
3195+
pKernelInfo->kernelDescriptor.payloadMappings.implicitArgs.maxWorkGroupSize = 0;
3196+
{
3197+
MockKernel kernel(program.get(), *pKernelInfo, *pClDevice);
3198+
3199+
pKernelInfo->kernelDescriptor.kernelAttributes.numGrfRequired = GrfConfig::LargeGrfNumber - 1;
3200+
EXPECT_EQ(CL_SUCCESS, kernel.initialize());
3201+
EXPECT_EQ(pDevice->getDeviceInfo().maxWorkGroupSize, *kernel.maxWorkGroupSizeForCrossThreadData);
3202+
EXPECT_EQ(pDevice->getDeviceInfo().maxWorkGroupSize, kernel.maxKernelWorkGroupSize);
3203+
}
3204+
3205+
{
3206+
MockKernel kernel(program.get(), *pKernelInfo, *pClDevice);
3207+
3208+
pKernelInfo->kernelDescriptor.kernelAttributes.numGrfRequired = GrfConfig::LargeGrfNumber;
3209+
EXPECT_EQ(CL_SUCCESS, kernel.initialize());
3210+
EXPECT_EQ(pDevice->getDeviceInfo().maxWorkGroupSize >> 1, *kernel.maxWorkGroupSizeForCrossThreadData);
3211+
EXPECT_EQ(pDevice->getDeviceInfo().maxWorkGroupSize >> 1, kernel.maxKernelWorkGroupSize);
3212+
}
3213+
}

‎opencl/test/unit_test/test_files/igdrcl.config

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -299,5 +299,6 @@ EnableUserFenceUseCtxId = -1
299299
EnableResourceTags = 0
300300
SetKmdWaitTimeout = -1
301301
OverrideNotifyEnableForTagUpdatePostSync = -1
302+
EnableCacheFlushAfterWalkerForAllQueues = -1
302303
Force32BitDriverSupport = -1
303304
OverrideCmdQueueSynchronousMode = -1

‎shared/source/debug_settings/debug_variables_base.inl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -206,6 +206,7 @@ DECLARE_DEBUG_VARIABLE(int32_t, MinHwThreadsUnoccupied, 0, "If not zero then max
206206
DECLARE_DEBUG_VARIABLE(int32_t, PerformImplicitFlushEveryEnqueueCount, -1, "If greater than 0, driver performs implicit flush every N submissions.")
207207
DECLARE_DEBUG_VARIABLE(int32_t, PerformImplicitFlushForNewResource, -1, "-1: platform specific, 0: force disable, 1: force enable")
208208
DECLARE_DEBUG_VARIABLE(int32_t, PerformImplicitFlushForIdleGpu, -1, "-1: platform specific, 0: force disable, 1: force enable")
209+
DECLARE_DEBUG_VARIABLE(int32_t, EnableCacheFlushAfterWalkerForAllQueues, -1, "Enable cache flush after walker even if queue doesn't require it")
209210

210211
/*DIRECT SUBMISSION FLAGS*/
211212
DECLARE_DEBUG_VARIABLE(int32_t, EnableDirectSubmission, -1, "-1: default (disabled), 0: disable, 1:enable. Enables direct submission of command buffers bypassing KMD")

0 commit comments

Comments (0)
Please sign in to comment.