Skip to content

Commit ac2cc85

Browse files
bcheng0127sys_zuul
authored andcommitted
BCR for step B TGLLP
Change-Id: Ie3856c842b4fc73b13c71f99328230f3ca177614
1 parent 3bc2e64 commit ac2cc85

File tree

6 files changed

+537
-4
lines changed

6 files changed

+537
-4
lines changed

visa/FlowGraph.cpp

Lines changed: 363 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4280,6 +4280,361 @@ static int getConflictTimesForTGLLP(std::ostream& output, int *firstRegCandidate
42804280
return conflictTimes;
42814281
}
42824282

4283+
static int getConflictTimesForTGL(std::ostream& output, int *firstRegCandidate, int &sameBankConflicts, bool zeroOne, bool isTGLLP)
4284+
{
4285+
int conflictTimes = 0;
4286+
int bundles[2][16];
4287+
int bankSrcs[2];
4288+
4289+
for (int i = 0; i < 2; i++)
4290+
{
4291+
for (int j = 0; j < 16; j++)
4292+
{
4293+
bundles[i][j] = -1;
4294+
}
4295+
bankSrcs[i] = 0;
4296+
}
4297+
4298+
output << "{";
4299+
for (int i = 0; i < G4_MAX_SRCS; i++)
4300+
{
4301+
bool same_register = false;
4302+
4303+
if (isValidReg(firstRegCandidate[i]))
4304+
{
4305+
for (int j = 0; j < i; j++)
4306+
{
4307+
if (isValidReg(firstRegCandidate[j]) && j != i)
4308+
{
4309+
if (firstRegCandidate[j] == firstRegCandidate[i])
4310+
{
4311+
same_register = true;
4312+
break;
4313+
}
4314+
}
4315+
}
4316+
4317+
if (same_register)
4318+
{
4319+
continue;
4320+
}
4321+
4322+
int bundleID = (firstRegCandidate[i] % 64) / 4;
4323+
if (isTGLLP)
4324+
{
4325+
bundleID = (firstRegCandidate[i] % 16) / 2;
4326+
}
4327+
4328+
int bankID = (firstRegCandidate[i] % 4) / 2;
4329+
if (zeroOne)
4330+
{
4331+
bankID = (firstRegCandidate[i]) % 2;
4332+
}
4333+
4334+
//Same bank and same bundle
4335+
if (bundles[bankID][bundleID] != -1) //Same bank and same bundle
4336+
{
4337+
conflictTimes++;
4338+
}
4339+
4340+
bundles[bankID][bundleID] = i;
4341+
bankSrcs[bankID]++;
4342+
if (bankID == 0)
4343+
{
4344+
output << "E:";
4345+
}
4346+
else
4347+
{
4348+
output << "O:";
4349+
}
4350+
output << bundleID << ",";
4351+
}
4352+
}
4353+
4354+
//Same bank but different bundles
4355+
if (conflictTimes == 0 &&
4356+
(bankSrcs[0] > 2 ||
4357+
bankSrcs[1] > 2))
4358+
{
4359+
conflictTimes++;
4360+
sameBankConflicts++;
4361+
}
4362+
else if (bankSrcs[0] > 2 ||
4363+
bankSrcs[1] > 2)
4364+
{
4365+
sameBankConflicts++;
4366+
}
4367+
4368+
output << "}, ";
4369+
4370+
return conflictTimes;
4371+
}
4372+
4373+
/*
4374+
* Gen12 BC evaluation
4375+
* In Gen12, there are 8 bundles and 2 banks per HW thread.
4376+
* Banks are divided according to EVEN/ODD of register index: 0101010101010101
4377+
* There are 8 bundles per 16 registers: 0011223344556677
4378+
* For two adjacent instructions: inst1 and inst2, inst1_src1(, inst1_src2) and inst2_src0 will be read in same cycle
4379+
* The evaluation takes the HW swap and read suppression mechanisms into account.
4380+
* HW swap:
4381+
* The original GRF register reading sequence for a three source instruction is: src0 in cycle0 and src1 and src2 in cycle2.
4382+
* HW swap mechanism detects the conflict between src1 and src2, if there is a conflict, HW will read src1 in cycle0 and src0 and src2 in cycle1.
4383+
* Note that:
4384+
* 1. for SIMD16, HW swap only happens when detecting conflicts in first simd8's registers. conflict in second simd8 will not trigger swap.
4385+
* 2. for SIMD16, when swapping happens, the src1 and src0 of both simd8 instructions will be swapped.
4386+
* Read suppression between instructions:
4387+
* The read suppression mechanism is used to save the GRF register reading operations with a register cache in HW. The suppression discussed here
4388+
* is the suppression between instructions. For each source operand slot, HW provides a GRF cache. With the cache, if the same GRF will be read in
4389+
* the instruction, the read will not happen, the cached value will be used directly.
4390+
* Note that:
4391+
* 1. The cache will only buffer the latest GRF which was read
4392+
* 2. The cache will be flushed if the buffered register is used as destination operand.
4393+
* 3. For SIMD16, if one source is scalar, the read suppression doesn't happen, no matter within the SIMD16 instruction or with the following instruction.
4394+
* 4. The read suppression between instructions only happens in src1 and src2
4395+
* 5. 2 GRFs read suppression for src1 and 1 GRF read suppression for src0 and src2.
4396+
* Read suppression within an instruction:
4397+
* 1. Works for all source operands.
4398+
*
4399+
* suppressRegs is used as the read suppression buffer
4400+
* lastDst is used to keep dst register of last instruction. It's used to clear read suppression buffer. Once a register is defined, it's not buffered anymore
4401+
* lastRegs is used to keep the src1 and src2 of last instruction, in case there is conflict with current instruction GRF read
4402+
*/
4403+
uint32_t G4_BB::emitBankConflictGen12(std::ostream& os_output, G4_INST *inst, int *suppressRegs, int &sameConflictTimes, int &twoSrcConflicts, int &simd16RS, bool zeroOne, bool isTGLLP)
4404+
{
4405+
std::stringstream output;
4406+
4407+
parent->G12BCStats.addSIMD8();
4408+
4409+
if (inst->isSend() || inst->isMath() ||
4410+
inst->isSWSBSync() ||
4411+
inst->isWait() ||
4412+
inst->isReturn() || inst->isCall())
4413+
{ //Flush
4414+
for (int i = 0; i < 4; i++)
4415+
{
4416+
setInValidReg(suppressRegs[i]);
4417+
}
4418+
return 0;
4419+
}
4420+
4421+
int currInstRegs[2][G4_MAX_SRCS];
4422+
int currInstExecSize[G4_MAX_SRCS] = {0};
4423+
int firstRegCandidate[G4_MAX_SRCS];
4424+
int secondRegCandidate[G4_MAX_SRCS];
4425+
bool isScalar[G4_MAX_SRCS];
4426+
int candidateNum = 0;
4427+
int dstExecSize = 0;
4428+
int dstRegs[2];
4429+
4430+
for (int i = 0; i < G4_MAX_SRCS; i++)
4431+
{
4432+
setInValidReg(firstRegCandidate[i]);
4433+
setInValidReg(secondRegCandidate[i]);
4434+
setInValidReg(currInstRegs[0][i]);
4435+
setInValidReg(currInstRegs[1][i]);
4436+
isScalar[i] = false;
4437+
}
4438+
setInValidReg(dstRegs[0]);
4439+
setInValidReg(dstRegs[1]);
4440+
4441+
bool instSplit = false;
4442+
4443+
//Get Dst
4444+
G4_DstRegRegion* dstOpnd = inst->getDst();
4445+
if (dstOpnd &&
4446+
!dstOpnd->isIndirect() &&
4447+
dstOpnd->isGreg())
4448+
{
4449+
dstExecSize = dstOpnd->getLinearizedEnd() - dstOpnd->getLinearizedStart() + 1;
4450+
uint32_t byteAddress = dstOpnd->getLinearizedStart();
4451+
dstRegs[0] = byteAddress / GENX_GRF_REG_SIZ;
4452+
if (dstExecSize > 32)
4453+
{
4454+
dstRegs[1] = dstRegs[0] + (dstExecSize + GENX_GRF_REG_SIZ - 1) / GENX_GRF_REG_SIZ - 1;
4455+
instSplit = true;
4456+
}
4457+
}
4458+
4459+
//Get src
4460+
for (int i = 0; i < inst->getNumSrc(); i++)
4461+
{
4462+
setInValidReg(currInstRegs[0][i]);
4463+
setInValidReg(currInstRegs[1][i]);
4464+
G4_Operand * srcOpnd = inst->getSrc(i);
4465+
if (srcOpnd)
4466+
{
4467+
if (srcOpnd->isSrcRegRegion() &&
4468+
srcOpnd->asSrcRegRegion()->getBase() &&
4469+
srcOpnd->asSrcRegRegion()->getBase()->isRegVar())
4470+
{
4471+
G4_RegVar* baseVar = static_cast<G4_RegVar*>(srcOpnd->asSrcRegRegion()->getBase());
4472+
currInstExecSize[i] = srcOpnd->getLinearizedEnd() - srcOpnd->getLinearizedStart() + 1;
4473+
if (baseVar->isGreg()) {
4474+
uint32_t byteAddress = srcOpnd->getLinearizedStart();
4475+
currInstRegs[0][i] = byteAddress / GENX_GRF_REG_SIZ;
4476+
4477+
if (currInstExecSize[i] > 32)
4478+
{
4479+
currInstRegs[1][i] = currInstRegs[0][i] + 1;// (currInstExecSize[i] + GENX_GRF_REG_SIZ - 1) / GENX_GRF_REG_SIZ - 1;
4480+
instSplit = true;
4481+
}
4482+
else if (srcOpnd->asSrcRegRegion()->isScalar()) //No Read suppression for SIMD 16/scalar src
4483+
{
4484+
currInstRegs[1][i] = currInstRegs[0][i];
4485+
isScalar[i] = true;
4486+
}
4487+
else
4488+
{
4489+
setInValidReg(currInstRegs[1][i]);
4490+
}
4491+
}
4492+
}
4493+
}
4494+
}
4495+
4496+
if (instSplit)
4497+
{
4498+
parent->G12BCStats.addSIMD8();
4499+
}
4500+
4501+
bool lastInstSplit = suppressRegs[4] == 1;
4502+
4503+
if (instSplit != lastInstSplit)
4504+
{
4505+
for (int i = 0; i < 4; i++)
4506+
{
4507+
setInValidReg(suppressRegs[i]);
4508+
}
4509+
}
4510+
else
4511+
{
4512+
//Read Suppression for current instruction
4513+
output << " R{";
4514+
for (int i = 0; i < 3; i++)
4515+
{
4516+
if (instSplit && i != 1)
4517+
{
4518+
continue;
4519+
}
4520+
4521+
if (!instSplit && i == 1)
4522+
{
4523+
continue;
4524+
}
4525+
4526+
if (isValidReg(suppressRegs[i]) &&
4527+
currInstRegs[0][i] == suppressRegs[i] && !isScalar[i])
4528+
{
4529+
setInValidReg(currInstRegs[0][i]);
4530+
setInValidReg(currInstRegs[1][i]);
4531+
output << "r" << suppressRegs[i] << ",";
4532+
}
4533+
}
4534+
output << "}";
4535+
}
4536+
4537+
if (instSplit)
4538+
{
4539+
suppressRegs[4] = 1;
4540+
}
4541+
else
4542+
{
4543+
suppressRegs[4] = 0;
4544+
}
4545+
4546+
//Kill all previous read suppression candiadte if it wrote in DST
4547+
if (isValidReg(dstRegs[0]))
4548+
{
4549+
for (int i = 0; i < 4; i++)
4550+
{
4551+
if (suppressRegs[i] == dstRegs[0])
4552+
{
4553+
setInValidReg(suppressRegs[i]);
4554+
}
4555+
}
4556+
}
4557+
4558+
//No suppression, update the suppressRegs[0] for gen12lp
4559+
//suppressRegs[1], suppressRegs[2] will be updated with next instruction
4560+
int conflictTimes = 0;
4561+
for (int i = 0; i < 3; i++)
4562+
{
4563+
if (isValidReg(currInstRegs[0][i]))
4564+
{
4565+
firstRegCandidate[candidateNum] = currInstRegs[0][i];
4566+
candidateNum++;
4567+
}
4568+
}
4569+
4570+
if (candidateNum > 1)
4571+
{
4572+
conflictTimes = getConflictTimesForTGL(output, firstRegCandidate, sameConflictTimes, zeroOne, isTGLLP);
4573+
if (candidateNum == 2)
4574+
{
4575+
twoSrcConflicts += conflictTimes;
4576+
}
4577+
}
4578+
4579+
if (instSplit)
4580+
{
4581+
if (isValidReg(dstRegs[1]))
4582+
{
4583+
for (int i = 0; i < 4; i++)
4584+
{
4585+
if (suppressRegs[i] == dstRegs[1])
4586+
{
4587+
setInValidReg(suppressRegs[i]);
4588+
}
4589+
}
4590+
}
4591+
4592+
candidateNum = 0;
4593+
//For SIMD8, if any GRF0 of src1 or src2 of inst1 is GRF register
4594+
for (int i = 0; i < 3; i++)
4595+
{
4596+
if (isValidReg(currInstRegs[1][i]))
4597+
{
4598+
secondRegCandidate[candidateNum] = currInstRegs[1][i];
4599+
candidateNum++;
4600+
}
4601+
}
4602+
4603+
if (candidateNum > 1)
4604+
{
4605+
int c = 0;
4606+
c = getConflictTimesForTGL(output, secondRegCandidate, sameConflictTimes, zeroOne, isTGLLP);
4607+
conflictTimes += c;
4608+
if (candidateNum == 2)
4609+
{
4610+
twoSrcConflicts += c;
4611+
}
4612+
if (currInstExecSize[0] <= 16 || currInstExecSize[1] <= 16 || currInstExecSize[2] <= 16)
4613+
{
4614+
simd16RS += c;
4615+
}
4616+
}
4617+
}
4618+
4619+
for (int i = 0; i < 3; i++)
4620+
{
4621+
if (isValidReg(currInstRegs[0][i]))
4622+
{
4623+
suppressRegs[i] = currInstRegs[0][i];
4624+
}
4625+
}
4626+
4627+
if (conflictTimes != 0)
4628+
{
4629+
output << " {";
4630+
output << "BC=";
4631+
output << conflictTimes;
4632+
output << "}";
4633+
os_output << output.str();
4634+
}
4635+
4636+
return conflictTimes;
4637+
}
42834638

42844639
uint32_t G4_BB::emitBankConflictGen12lp(std::ostream& os_output, G4_INST *inst, int *suppressRegs, int *lastRegs, int &sameConflictTimes, int &twoSrcConflicts, int &simd16RS)
42854640
{
@@ -4617,7 +4972,14 @@ void G4_BB::emitBasicInstructionIga(char* instSyntax, std::ostream& output, INST
46174972
int twoSrcConflicts = 0;
46184973
int simd16SuppressionConflicts = 0;
46194974
unsigned BCNum = 0;
4620-
BCNum = emitBankConflictGen12lp(output, inst, suppressRegs, lastRegs, sameBankConflicts, twoSrcConflicts, simd16SuppressionConflicts);
4975+
if (getGenxPlatform() == GENX_TGLLP && GetStepping() == Step_A)
4976+
{
4977+
BCNum = emitBankConflictGen12lp(output, inst, suppressRegs, lastRegs, sameBankConflicts, twoSrcConflicts, simd16SuppressionConflicts);
4978+
}
4979+
else
4980+
{
4981+
BCNum = emitBankConflictGen12(output, inst, suppressRegs, sameBankConflicts, twoSrcConflicts, simd16SuppressionConflicts, false, true);
4982+
}
46214983
parent->G12BCStats.addBC(BCNum);
46224984
parent->G12BCStats.addSameBankBC(sameBankConflicts);
46234985
parent->G12BCStats.add2SrcBC(twoSrcConflicts);

visa/FlowGraph.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -438,6 +438,7 @@ class G4_BB
438438

439439
uint32_t emitBankConflictGen12lp(std::ostream & os_output, G4_INST * inst, int * suppressRegs, int * lastRegs, int & sameConflictTimes, int & twoSrcConflicts, int & simd16RS);
440440
uint32_t countReadModifyWrite(std::ostream& os_output, G4_INST *inst);
441+
uint32_t emitBankConflictGen12(std::ostream & os_output, G4_INST * inst, int * suppressRegs, int & sameConflictTimes, int & twoSrcConflicts, int & simd16RS, bool zeroOne, bool isTGLLP);
441442
void emitDepInfo(std::ostream& output, G4_INST *inst, int offset);
442443

443444
bool isEndWithCall() const { return getLastOpcode() == G4_call; }

0 commit comments

Comments
 (0)