89#define DEBUG_TYPE "si-wqm"
98 StateStrict = StateStrictWWM | StateStrictWQM,
// Wraps a raw state bitmask so it can be handed to the stream printer; the
// constructor only stores the value, decoding happens when it is printed.
105 explicit PrintState(
int State) : State(State) {}
111 static const std::pair<char, const char *> Mapping[] = {
112 std::pair(StateWQM,
"WQM"), std::pair(StateStrictWWM,
"StrictWWM"),
113 std::pair(StateStrictWQM,
"StrictWQM"), std::pair(StateExact,
"Exact")};
114 char State = PS.State;
115 for (
auto M : Mapping) {
116 if (State & M.first) {
133 char MarkedStates = 0;
140 char InitialState = 0;
141 bool NeedsLowering =
false;
153class SIWholeQuadMode {
190 std::vector<WorkItem> &Worklist);
193 std::vector<WorkItem> &Worklist);
195 std::vector<WorkItem> &Worklist);
197 std::vector<WorkItem> &Worklist);
198 char scanInstructions(
MachineFunction &MF, std::vector<WorkItem> &Worklist,
200 void propagateInstruction(
MachineInstr &
MI, std::vector<WorkItem> &Worklist);
215 Register SaveOrig,
char StrictStateNeeded);
218 char NonStrictState,
char CurrentStrictState);
227 bool lowerLiveMaskQueries();
228 bool lowerCopyInstrs();
229 bool lowerKillInstrs(
bool IsWQM);
// Returns the fixed, human-readable name of this pass.
243 StringRef getPassName()
const override {
return "SI Whole Quad Mode"; }
260char SIWholeQuadModeLegacy::ID = 0;
273 return new SIWholeQuadModeLegacy;
278 for (
const auto &BII : Blocks) {
281 <<
" InNeeds = " << PrintState(BII.second.InNeeds)
282 <<
", Needs = " << PrintState(BII.second.Needs)
283 <<
", OutNeeds = " << PrintState(BII.second.OutNeeds) <<
"\n\n";
286 auto III = Instructions.find(&
MI);
287 if (III != Instructions.end()) {
288 dbgs() <<
" " <<
MI <<
" Needs = " << PrintState(III->second.Needs)
289 <<
", OutNeeds = " << PrintState(III->second.OutNeeds) <<
'\n';
296void SIWholeQuadMode::markInstruction(MachineInstr &
MI,
char Flag,
297 std::vector<WorkItem> &Worklist) {
298 InstrInfo &
II = Instructions[&
MI];
300 assert(!(Flag & StateExact) && Flag != 0);
309 Flag &= ~II.Disabled;
313 if ((
II.Needs & Flag) == Flag)
318 Worklist.emplace_back(&
MI);
322void SIWholeQuadMode::markDefs(
const MachineInstr &
UseMI,
LiveRange &LR,
323 VirtRegOrUnit VRegOrUnit,
unsigned SubReg,
324 char Flag, std::vector<WorkItem> &Worklist) {
334 const LaneBitmask UseLanes =
335 SubReg ?
TRI->getSubRegIndexLaneMask(SubReg)
346 LaneBitmask DefinedLanes;
// Captures a phi value together with the predecessor index and the set of
// lanes defined so far, so that this traversal state can be restored later
// when the entry is taken back off the phi stack.
348 PhiEntry(
const VNInfo *Phi,
unsigned PredIdx, LaneBitmask DefinedLanes)
349 :
Phi(
Phi), PredIdx(PredIdx), DefinedLanes(DefinedLanes) {}
351 using VisitKey = std::pair<const VNInfo *, LaneBitmask>;
353 SmallSet<VisitKey, 4> Visited;
354 LaneBitmask DefinedLanes;
355 unsigned NextPredIdx = 0;
357 const VNInfo *NextValue =
nullptr;
358 const VisitKey
Key(
Value, DefinedLanes);
365 if (
Value->isPHIDef()) {
368 assert(
MBB &&
"Phi-def has no defining MBB");
371 unsigned Idx = NextPredIdx;
374 for (; PI != PE && !NextValue; ++PI, ++Idx) {
376 if (!Visited.
count(VisitKey(VN, DefinedLanes)))
386 assert(
MI &&
"Def has no defining instruction");
391 for (
const MachineOperand &
Op :
MI->all_defs()) {
396 LaneBitmask OpLanes =
398 :
TRI->getSubRegIndexLaneMask(
Op.getSubReg());
399 LaneBitmask Overlap = (UseLanes & OpLanes);
402 HasDef |= Overlap.
any();
405 DefinedLanes |= OpLanes;
409 if ((DefinedLanes & UseLanes) != UseLanes) {
412 if (
const VNInfo *VN = LRQ.
valueIn()) {
413 if (!Visited.
count(VisitKey(VN, DefinedLanes)))
420 markInstruction(*
MI, Flag, Worklist);
423 markInstruction(*
MI, Flag, Worklist);
427 if (!NextValue && !PhiStack.
empty()) {
430 NextValue =
Entry.Phi;
431 NextPredIdx =
Entry.PredIdx;
432 DefinedLanes =
Entry.DefinedLanes;
440void SIWholeQuadMode::markOperand(
const MachineInstr &
MI,
441 const MachineOperand &
Op,
char Flag,
442 std::vector<WorkItem> &Worklist) {
449 case AMDGPU::EXEC_LO:
459 markDefs(
MI, LR, VirtRegOrUnit(
Reg),
Op.getSubReg(), Flag, Worklist);
468 markDefs(
MI, LR, VirtRegOrUnit(Unit), AMDGPU::NoSubRegister, Flag,
475void SIWholeQuadMode::markInstructionUses(
const MachineInstr &
MI,
char Flag,
476 std::vector<WorkItem> &Worklist) {
477 LLVM_DEBUG(
dbgs() <<
"markInstructionUses " << PrintState(Flag) <<
": "
480 for (
const MachineOperand &Use :
MI.all_uses())
481 markOperand(
MI, Use, Flag, Worklist);
486char SIWholeQuadMode::scanInstructions(
487 MachineFunction &MF, std::vector<WorkItem> &Worklist,
489 char GlobalFlags = 0;
491 SmallVector<MachineInstr *, 4> SoftWQMInstrs;
492 bool HasImplicitDerivatives =
499 ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
500 for (MachineBasicBlock *
MBB : RPOT) {
501 BlockInfo &BBI = Blocks[
MBB];
503 for (MachineInstr &
MI : *
MBB) {
504 InstrInfo &III = Instructions[&
MI];
505 unsigned Opcode =
MI.getOpcode();
508 if (
TII->isWQM(Opcode)) {
513 if (ST->hasExtendedImageInsts() && HasImplicitDerivatives) {
517 markInstructionUses(
MI, StateWQM, Worklist);
518 GlobalFlags |= StateWQM;
520 }
else if (Opcode == AMDGPU::WQM) {
524 LowerToCopyInstrs.insert(&
MI);
525 }
else if (Opcode == AMDGPU::SOFT_WQM) {
526 LowerToCopyInstrs.insert(&
MI);
528 }
else if (Opcode == AMDGPU::STRICT_WWM) {
532 markInstructionUses(
MI, StateStrictWWM, Worklist);
533 GlobalFlags |= StateStrictWWM;
535 }
else if (Opcode == AMDGPU::STRICT_WQM ||
536 TII->isDualSourceBlendEXP(
MI)) {
540 markInstructionUses(
MI, StateStrictWQM, Worklist);
541 GlobalFlags |= StateStrictWQM;
543 if (Opcode == AMDGPU::STRICT_WQM) {
549 BBI.Needs |= StateExact;
550 if (!(BBI.InNeeds & StateExact)) {
551 BBI.InNeeds |= StateExact;
552 Worklist.emplace_back(
MBB);
554 GlobalFlags |= StateExact;
555 III.Disabled = StateWQM | StateStrict;
557 }
else if (Opcode == AMDGPU::LDS_PARAM_LOAD ||
558 Opcode == AMDGPU::DS_PARAM_LOAD ||
559 Opcode == AMDGPU::LDS_DIRECT_LOAD ||
560 Opcode == AMDGPU::DS_DIRECT_LOAD) {
563 III.Needs |= StateStrictWQM;
564 GlobalFlags |= StateStrictWQM;
565 }
else if (Opcode == AMDGPU::V_SET_INACTIVE_B32) {
567 III.Disabled = StateStrict;
568 MachineOperand &Inactive =
MI.getOperand(4);
569 if (Inactive.
isReg()) {
570 if (Inactive.
isUndef() &&
MI.getOperand(3).getImm() == 0)
571 LowerToCopyInstrs.insert(&
MI);
573 markOperand(
MI, Inactive, StateStrictWWM, Worklist);
576 BBI.NeedsLowering =
true;
577 }
else if (
TII->isDisableWQM(
MI)) {
578 BBI.Needs |= StateExact;
579 if (!(BBI.InNeeds & StateExact)) {
580 BBI.InNeeds |= StateExact;
581 Worklist.emplace_back(
MBB);
583 GlobalFlags |= StateExact;
584 III.Disabled = StateWQM | StateStrict;
585 }
else if (Opcode == AMDGPU::SI_PS_LIVE ||
586 Opcode == AMDGPU::SI_LIVE_MASK) {
588 }
else if (Opcode == AMDGPU::SI_KILL_I1_TERMINATOR ||
589 Opcode == AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR ||
590 Opcode == AMDGPU::SI_DEMOTE_I1) {
592 BBI.NeedsLowering =
true;
593 }
else if (Opcode == AMDGPU::SI_INIT_EXEC ||
594 Opcode == AMDGPU::SI_INIT_EXEC_FROM_INPUT ||
595 Opcode == AMDGPU::SI_INIT_WHOLE_WAVE) {
597 }
else if (WQMOutputs) {
602 for (
const MachineOperand &MO :
MI.defs()) {
605 TRI->hasVectorRegisters(
TRI->getPhysRegBaseClass(
Reg))) {
612 if (
TII->hasUnwantedEffectsWhenEXECEmpty(
MI)) {
613 for (
auto &
Op :
MI.uses()) {
616 if (!
TRI->isVectorRegister(*MRI,
Op.getReg()))
625 markInstruction(
MI, Flags, Worklist);
626 GlobalFlags |=
Flags;
635 if (GlobalFlags & StateWQM) {
636 for (MachineInstr *
MI : SetInactiveInstrs)
637 markInstruction(*
MI, StateWQM, Worklist);
638 for (MachineInstr *
MI : SoftWQMInstrs)
639 markInstruction(*
MI, StateWQM, Worklist);
645void SIWholeQuadMode::propagateInstruction(MachineInstr &
MI,
646 std::vector<WorkItem>& Worklist) {
647 MachineBasicBlock *
MBB =
MI.getParent();
648 InstrInfo
II = Instructions[&
MI];
649 BlockInfo &BI = Blocks[
MBB];
653 if ((
II.OutNeeds & StateWQM) && !(
II.Disabled & StateWQM) &&
654 (
MI.isTerminator() || (
TII->usesVM_CNT(
MI) &&
MI.mayStore()))) {
655 Instructions[&
MI].Needs = StateWQM;
660 if (
II.Needs & StateWQM) {
661 BI.Needs |= StateWQM;
662 if (!(BI.InNeeds & StateWQM)) {
663 BI.InNeeds |= StateWQM;
664 Worklist.emplace_back(
MBB);
669 if (MachineInstr *PrevMI =
MI.getPrevNode()) {
670 char InNeeds = (
II.Needs & ~StateStrict) |
II.OutNeeds;
671 if (!PrevMI->isPHI()) {
672 InstrInfo &PrevII = Instructions[PrevMI];
673 if ((PrevII.OutNeeds | InNeeds) != PrevII.OutNeeds) {
674 PrevII.OutNeeds |= InNeeds;
675 Worklist.emplace_back(PrevMI);
684 markInstructionUses(
MI,
II.Needs, Worklist);
688 if (
II.Needs & StateStrictWWM)
689 BI.Needs |= StateStrictWWM;
690 if (
II.Needs & StateStrictWQM)
691 BI.Needs |= StateStrictWQM;
694void SIWholeQuadMode::propagateBlock(MachineBasicBlock &
MBB,
695 std::vector<WorkItem>& Worklist) {
696 BlockInfo BI = Blocks[&
MBB];
701 InstrInfo &LastII = Instructions[LastMI];
702 if ((LastII.OutNeeds | BI.OutNeeds) != LastII.OutNeeds) {
703 LastII.OutNeeds |= BI.OutNeeds;
704 Worklist.emplace_back(LastMI);
710 BlockInfo &PredBI = Blocks[Pred];
711 if ((PredBI.OutNeeds | BI.InNeeds) == PredBI.OutNeeds)
714 PredBI.OutNeeds |= BI.InNeeds;
715 PredBI.InNeeds |= BI.InNeeds;
716 Worklist.emplace_back(Pred);
721 BlockInfo &SuccBI = Blocks[Succ];
722 if ((SuccBI.InNeeds | BI.OutNeeds) == SuccBI.InNeeds)
725 SuccBI.InNeeds |= BI.OutNeeds;
726 Worklist.emplace_back(Succ);
730char SIWholeQuadMode::analyzeFunction(MachineFunction &MF) {
731 std::vector<WorkItem> Worklist;
733 char GlobalFlags = scanInstructions(MF, Worklist, ExeczSideEffectInstrs);
735 while (!Worklist.empty()) {
736 WorkItem WI = Worklist.back();
740 propagateInstruction(*WI.MI, Worklist);
742 propagateBlock(*WI.MBB, Worklist);
744 if (Worklist.empty()) {
750 for (
auto *
MI : ExeczSideEffectInstrs) {
751 InstrInfo
II = Instructions[
MI];
752 if (
II.OutNeeds & StateWQM)
753 markInstructionUses(*
MI, StateWQM, Worklist);
757 ExeczSideEffectInstrs.clear();
765SIWholeQuadMode::saveSCC(MachineBasicBlock &
MBB,
772 MachineInstr *Restore =
783void SIWholeQuadMode::splitBlock(MachineInstr *TermMI) {
784 MachineBasicBlock *BB = TermMI->
getParent();
788 MachineBasicBlock *SplitBB =
789 BB->
splitAt(*TermMI,
true, LIS);
793 unsigned NewOpcode = 0;
795 case AMDGPU::S_AND_B32:
796 NewOpcode = AMDGPU::S_AND_B32_term;
798 case AMDGPU::S_AND_B64:
799 NewOpcode = AMDGPU::S_AND_B64_term;
801 case AMDGPU::S_MOV_B32:
802 NewOpcode = AMDGPU::S_MOV_B32_term;
804 case AMDGPU::S_MOV_B64:
805 NewOpcode = AMDGPU::S_MOV_B64_term;
807 case AMDGPU::S_ANDN2_B32:
808 NewOpcode = AMDGPU::S_ANDN2_B32_term;
810 case AMDGPU::S_ANDN2_B64:
811 NewOpcode = AMDGPU::S_ANDN2_B64_term;
825 for (MachineBasicBlock *Succ : SplitBB->
successors()) {
826 DTUpdates.
push_back({DomTreeT::Insert, SplitBB, Succ});
827 DTUpdates.
push_back({DomTreeT::Delete, BB, Succ});
829 DTUpdates.
push_back({DomTreeT::Insert, BB, SplitBB});
837MachineInstr *SIWholeQuadMode::lowerKillF32(MachineInstr &
MI) {
852 switch (
MI.getOperand(2).getImm()) {
854 Opcode = AMDGPU::V_CMP_LG_F32_e64;
857 Opcode = AMDGPU::V_CMP_GE_F32_e64;
860 Opcode = AMDGPU::V_CMP_GT_F32_e64;
863 Opcode = AMDGPU::V_CMP_LE_F32_e64;
866 Opcode = AMDGPU::V_CMP_LT_F32_e64;
869 Opcode = AMDGPU::V_CMP_EQ_F32_e64;
872 Opcode = AMDGPU::V_CMP_O_F32_e64;
875 Opcode = AMDGPU::V_CMP_U_F32_e64;
879 Opcode = AMDGPU::V_CMP_NEQ_F32_e64;
883 Opcode = AMDGPU::V_CMP_NLT_F32_e64;
887 Opcode = AMDGPU::V_CMP_NLE_F32_e64;
891 Opcode = AMDGPU::V_CMP_NGT_F32_e64;
895 Opcode = AMDGPU::V_CMP_NGE_F32_e64;
899 Opcode = AMDGPU::V_CMP_NLG_F32_e64;
905 MachineBasicBlock &
MBB = *
MI.getParent();
908 MachineInstr *VcmpMI;
909 const MachineOperand &Op0 =
MI.getOperand(0);
910 const MachineOperand &Op1 =
MI.getOperand(1);
926 MachineInstr *MaskUpdateMI =
933 MachineInstr *EarlyTermMI =
936 MachineInstr *ExecMaskMI =
954MachineInstr *SIWholeQuadMode::lowerKillI1(MachineInstr &
MI,
bool IsWQM) {
957 MachineBasicBlock &
MBB = *
MI.getParent();
960 MachineInstr *MaskUpdateMI =
nullptr;
962 const bool IsDemote = IsWQM && (
MI.getOpcode() == AMDGPU::SI_DEMOTE_I1);
963 const MachineOperand &
Op =
MI.getOperand(0);
964 int64_t KillVal =
MI.getOperand(1).getImm();
965 MachineInstr *ComputeKilledMaskMI =
nullptr;
971 if (
Op.getImm() == KillVal) {
978 bool IsLastTerminator = std::next(
MI.getIterator()) ==
MBB.
end();
979 if (!IsLastTerminator) {
1011 MachineInstr *EarlyTermMI =
1016 MachineInstr *NewTerm;
1017 MachineInstr *WQMMaskMI =
nullptr;
1032 }
else if (!IsWQM) {
1050 if (ComputeKilledMaskMI)
1073void SIWholeQuadMode::lowerBlock(MachineBasicBlock &
MBB, BlockInfo &BI) {
1074 if (!BI.NeedsLowering)
1079 SmallVector<MachineInstr *, 4> SplitPoints;
1081 char State = BI.InitialState;
1085 auto MIState = StateTransition.find(&
MI);
1086 if (MIState != StateTransition.end())
1087 State = MIState->second;
1089 MachineInstr *SplitPoint =
nullptr;
1090 switch (
MI.getOpcode()) {
1091 case AMDGPU::SI_DEMOTE_I1:
1092 case AMDGPU::SI_KILL_I1_TERMINATOR:
1093 SplitPoint = lowerKillI1(
MI, State == StateWQM);
1095 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
1096 SplitPoint = lowerKillF32(
MI);
1098 case AMDGPU::ENTER_STRICT_WWM:
1099 ActiveLanesReg =
MI.getOperand(0).getReg();
1101 case AMDGPU::EXIT_STRICT_WWM:
1104 case AMDGPU::V_SET_INACTIVE_B32:
1105 if (ActiveLanesReg) {
1106 LiveInterval &LI = LIS->
getInterval(
MI.getOperand(5).getReg());
1108 MI.getOperand(5).setReg(ActiveLanesReg);
1111 assert(State == StateExact || State == StateWQM);
1122 for (MachineInstr *
MI : SplitPoints)
1142 SlotIndex FirstIdx = FirstNonDbg != MBBE
1147 SlotIndex Idx = PreferLast ? LastIdx : FirstIdx;
1148 const LiveRange::Segment *S;
1157 if (
Next < FirstIdx)
1162 assert(EndMI &&
"Segment does not end on valid instruction");
1186 bool IsExecDef =
false;
1187 for (
const MachineOperand &MO :
MBBI->all_defs()) {
1189 MO.getReg() == AMDGPU::EXEC_LO || MO.getReg() == AMDGPU::EXEC;
1203void SIWholeQuadMode::toExact(MachineBasicBlock &
MBB,
1208 bool IsTerminator = Before ==
MBB.
end();
1209 if (!IsTerminator) {
1211 if (FirstTerm !=
MBB.
end()) {
1214 IsTerminator = BeforeIdx > FirstTermIdx;
1235 StateTransition[
MI] = StateExact;
1238void SIWholeQuadMode::toWQM(MachineBasicBlock &
MBB,
1253 StateTransition[
MI] = StateWQM;
1256void SIWholeQuadMode::toStrictMode(MachineBasicBlock &
MBB,
1258 Register SaveOrig,
char StrictStateNeeded) {
1261 assert(StrictStateNeeded == StateStrictWWM ||
1262 StrictStateNeeded == StateStrictWQM);
1266 if (StrictStateNeeded == StateStrictWWM) {
1274 StateTransition[
MI] = StrictStateNeeded;
1277void SIWholeQuadMode::fromStrictMode(MachineBasicBlock &
MBB,
1279 Register SavedOrig,
char NonStrictState,
1280 char CurrentStrictState) {
1284 assert(CurrentStrictState == StateStrictWWM ||
1285 CurrentStrictState == StateStrictWQM);
1289 if (CurrentStrictState == StateStrictWWM) {
1299 StateTransition[
MI] = NonStrictState;
1302void SIWholeQuadMode::processBlock(MachineBasicBlock &
MBB, BlockInfo &BI,
1306 if (!IsEntry && BI.Needs == StateWQM && BI.OutNeeds != StateExact) {
1307 BI.InitialState = StateWQM;
1316 bool WQMFromExec = IsEntry;
1317 char State = (IsEntry || !(BI.InNeeds & StateWQM)) ? StateExact : StateWQM;
1318 char NonStrictState = 0;
1319 const TargetRegisterClass *BoolRC =
TRI->getBoolRC();
1324 if (
II != IE &&
II->getOpcode() == AMDGPU::COPY &&
1325 II->getOperand(1).getReg() == LMC.
ExecReg)
1340 BI.InitialState = State;
1342 for (
unsigned Idx = 0;; ++Idx) {
1344 char Needs = StateExact | StateWQM;
1350 if (FirstStrict == IE)
1354 if (IsEntry && Idx == 0 && (BI.InNeeds & StateWQM))
1360 MachineInstr &
MI = *
II;
1362 if (
MI.isTerminator() ||
TII->mayReadEXEC(*MRI,
MI)) {
1363 auto III = Instructions.find(&
MI);
1364 if (III != Instructions.end()) {
1365 if (III->second.Needs & StateStrictWWM)
1366 Needs = StateStrictWWM;
1367 else if (III->second.Needs & StateStrictWQM)
1368 Needs = StateStrictWQM;
1369 else if (III->second.Needs & StateWQM)
1372 Needs &= ~III->second.Disabled;
1373 OutNeeds = III->second.OutNeeds;
1378 Needs = StateExact | StateWQM | StateStrict;
1382 if (
MI.isBranch() && OutNeeds == StateExact)
1388 if (BI.OutNeeds & StateWQM)
1390 else if (BI.OutNeeds == StateExact)
1393 Needs = StateWQM | StateExact;
1397 if (!(Needs & State)) {
1399 if (State == StateStrictWWM || Needs == StateStrictWWM ||
1400 State == StateStrictWQM || Needs == StateStrictWQM) {
1402 First = FirstStrict;
1409 bool SaveSCC =
false;
1412 case StateStrictWWM:
1413 case StateStrictWQM:
1417 SaveSCC = (Needs & StateStrict) || ((Needs & StateWQM) && WQMFromExec);
1421 SaveSCC = !(Needs & StateWQM);
1427 char StartState = State & StateStrict ? NonStrictState : State;
1429 StartState == StateWQM && (Needs & StateExact) && !(Needs & StateWQM);
1430 bool ExactToWQM = StartState == StateExact && (Needs & StateWQM) &&
1431 !(Needs & StateExact);
1432 bool PreferLast = Needs == StateWQM;
1437 if ((WQMToExact && (OutNeeds & StateWQM)) || ExactToWQM) {
1439 if (
TII->hasUnwantedEffectsWhenEXECEmpty(*
I)) {
1440 PreferLast = WQMToExact;
1446 prepareInsertion(
MBB,
First,
II, PreferLast, SaveSCC);
1448 if (State & StateStrict) {
1449 assert(State == StateStrictWWM || State == StateStrictWQM);
1450 assert(SavedNonStrictReg);
1451 fromStrictMode(
MBB, Before, SavedNonStrictReg, NonStrictState, State);
1454 SavedNonStrictReg = 0;
1455 State = NonStrictState;
1458 if (Needs & StateStrict) {
1459 NonStrictState = State;
1460 assert(Needs == StateStrictWWM || Needs == StateStrictWQM);
1461 assert(!SavedNonStrictReg);
1464 toStrictMode(
MBB, Before, SavedNonStrictReg, Needs);
1468 if (!WQMFromExec && (OutNeeds & StateWQM)) {
1473 toExact(
MBB, Before, SavedWQMReg);
1475 }
else if (ExactToWQM) {
1476 assert(WQMFromExec == (SavedWQMReg == 0));
1478 toWQM(
MBB, Before, SavedWQMReg);
1494 if (Needs != (StateExact | StateWQM | StateStrict)) {
1495 if (Needs != (StateExact | StateWQM))
1506 assert(!SavedNonStrictReg);
1509bool SIWholeQuadMode::lowerLiveMaskQueries() {
1510 for (MachineInstr *
MI : LiveMaskQueries) {
1514 MachineInstr *
Copy =
1519 MI->eraseFromParent();
1521 return !LiveMaskQueries.empty();
1524bool SIWholeQuadMode::lowerCopyInstrs() {
1525 for (MachineInstr *
MI : LowerToMovInstrs) {
1526 assert(
MI->getNumExplicitOperands() == 2);
1530 const TargetRegisterClass *regClass =
1531 TRI->getRegClassForOperandReg(*MRI,
MI->getOperand(0));
1532 if (
TRI->isVGPRClass(regClass)) {
1533 const unsigned MovOp =
TII->getMovOpcode(regClass);
1534 MI->setDesc(
TII->get(MovOp));
1538 assert(
any_of(
MI->implicit_operands(), [](
const MachineOperand &MO) {
1539 return MO.isUse() && MO.getReg() == AMDGPU::EXEC;
1545 if (
MI->getOperand(0).isEarlyClobber()) {
1547 MI->getOperand(0).setIsEarlyClobber(
false);
1550 int Index =
MI->findRegisterUseOperandIdx(AMDGPU::EXEC,
nullptr);
1551 while (Index >= 0) {
1552 MI->removeOperand(Index);
1553 Index =
MI->findRegisterUseOperandIdx(AMDGPU::EXEC,
nullptr);
1555 MI->setDesc(
TII->get(AMDGPU::COPY));
1559 for (MachineInstr *
MI : LowerToCopyInstrs) {
1562 if (
MI->getOpcode() == AMDGPU::V_SET_INACTIVE_B32) {
1563 assert(
MI->getNumExplicitOperands() == 6);
1565 LiveInterval *RecomputeLI =
nullptr;
1566 if (
MI->getOperand(4).isReg())
1567 RecomputeLI = &LIS->
getInterval(
MI->getOperand(4).getReg());
1569 MI->removeOperand(5);
1570 MI->removeOperand(4);
1571 MI->removeOperand(3);
1572 MI->removeOperand(1);
1577 assert(
MI->getNumExplicitOperands() == 2);
1580 unsigned CopyOp =
MI->getOperand(1).isReg()
1581 ? (unsigned)AMDGPU::COPY
1582 :
TII->getMovOpcode(
TRI->getRegClassForOperandReg(
1583 *MRI,
MI->getOperand(0)));
1584 MI->setDesc(
TII->get(CopyOp));
1587 return !LowerToCopyInstrs.empty() || !LowerToMovInstrs.empty();
1590bool SIWholeQuadMode::lowerKillInstrs(
bool IsWQM) {
1591 for (MachineInstr *
MI : KillInstrs) {
1592 MachineInstr *SplitPoint =
nullptr;
1593 switch (
MI->getOpcode()) {
1594 case AMDGPU::SI_DEMOTE_I1:
1595 case AMDGPU::SI_KILL_I1_TERMINATOR:
1596 SplitPoint = lowerKillI1(*
MI, IsWQM);
1598 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
1599 SplitPoint = lowerKillF32(*
MI);
1605 return !KillInstrs.empty();
1608void SIWholeQuadMode::lowerInitExec(MachineInstr &
MI) {
1609 MachineBasicBlock *
MBB =
MI.getParent();
1611 if (
MI.getOpcode() == AMDGPU::SI_INIT_WHOLE_WAVE) {
1613 "init whole wave not in entry block");
1626 MI.eraseFromParent();
1635 if (
MI.getOpcode() == AMDGPU::SI_INIT_EXEC) {
1639 .
addImm(
MI.getOperand(0).getImm());
1644 MI.eraseFromParent();
1655 Register InputReg =
MI.getOperand(0).getReg();
1656 MachineInstr *FirstMI = &*
MBB->
begin();
1658 MachineInstr *DefInstr = MRI->
getVRegDef(InputReg);
1661 if (DefInstr != FirstMI) {
1680 auto BfeMI =
BuildMI(*
MBB, FirstMI,
DL,
TII->get(AMDGPU::S_BFE_U32), CountReg)
1682 .
addImm((
MI.getOperand(1).getImm() & Mask) | 0x70000);
1686 auto CmpMI =
BuildMI(*
MBB, FirstMI,
DL,
TII->get(AMDGPU::S_CMP_EQ_U32))
1687 .
addReg(CountReg, RegState::Kill)
1693 MI.eraseFromParent();
1698 MI.eraseFromParent();
1713SIWholeQuadMode::lowerInitExecInstrs(MachineBasicBlock &Entry,
bool &
Changed) {
1716 for (MachineInstr *
MI : InitExecInstrs) {
1720 if (
MI->getParent() == &Entry)
1721 InsertPt = std::next(
MI->getIterator());
1730bool SIWholeQuadMode::run(MachineFunction &MF) {
1732 <<
" ------------- \n");
1735 Instructions.clear();
1737 LiveMaskQueries.clear();
1738 LowerToCopyInstrs.clear();
1739 LowerToMovInstrs.clear();
1741 InitExecInstrs.clear();
1742 SetInactiveInstrs.
clear();
1743 StateTransition.clear();
1754 const bool HasLiveMaskQueries = !LiveMaskQueries.empty();
1755 const bool HasWaveModes = GlobalFlags & ~StateExact;
1756 const bool HasKills = !KillInstrs.empty();
1757 const bool UsesWQM = GlobalFlags & StateWQM;
1758 if (HasKills || UsesWQM || (HasWaveModes && HasLiveMaskQueries)) {
1769 for (MachineInstr *
MI : SetInactiveInstrs) {
1770 if (LowerToCopyInstrs.contains(
MI))
1772 auto &
Info = Instructions[
MI];
1773 if (
Info.MarkedStates & StateStrict) {
1774 Info.Needs |= StateStrictWWM;
1775 Info.Disabled &= ~StateStrictWWM;
1776 Blocks[
MI->getParent()].Needs |= StateStrictWWM;
1779 LowerToCopyInstrs.insert(
MI);
1785 Changed |= lowerLiveMaskQueries();
1788 if (!HasWaveModes) {
1790 Changed |= lowerKillInstrs(
false);
1791 }
else if (GlobalFlags == StateWQM) {
1797 lowerKillInstrs(
true);
1801 if (GlobalFlags & StateWQM)
1802 Blocks[&
Entry].InNeeds |= StateWQM;
1804 for (
auto &BII : Blocks)
1805 processBlock(*BII.first, BII.second, BII.first == &Entry);
1807 for (
auto &BII : Blocks)
1808 lowerBlock(*BII.first, BII.second);
1813 if (LiveMaskReg != LMC.
ExecReg)
1822 if (!KillInstrs.empty() || !InitExecInstrs.empty())
1828bool SIWholeQuadModeLegacy::runOnMachineFunction(MachineFunction &MF) {
1829 LiveIntervals *LIS = &getAnalysis<LiveIntervalsWrapperPass>().getLIS();
1830 auto *MDTWrapper = getAnalysisIfAvailable<MachineDominatorTreeWrapperPass>();
1831 MachineDominatorTree *MDT = MDTWrapper ? &MDTWrapper->getDomTree() :
nullptr;
1833 getAnalysisIfAvailable<MachinePostDominatorTreeWrapperPass>();
1834 MachinePostDominatorTree *PDT =
1835 PDTWrapper ? &PDTWrapper->getPostDomTree() :
nullptr;
1836 SIWholeQuadMode Impl(MF, LIS, MDT, PDT);
1837 return Impl.run(MF);
1850 SIWholeQuadMode Impl(MF, LIS, MDT, PDT);
MachineInstrBuilder & UseMI
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Provides AMDGPU specific target descriptions.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static void analyzeFunction(Function &Fn, const DataLayout &Layout, FunctionVarLocsBuilder *FnVarLocs)
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
AMD GCN specific subclass of TargetSubtarget.
const HexagonInstrInfo * TII
Register const TargetRegisterInfo * TRI
This file implements a map that provides insertion order iteration.
Promote Memory to Register
uint64_t IntrinsicInst * II
#define INITIALIZE_PASS_DEPENDENCY(depName)
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
This file builds on the ADT/GraphTraits.h file to build a generic graph post order iterator.
static void splitBlock(MachineBasicBlock &MBB, MachineInstr &MI, MachineDominatorTree *MDT, MachineLoopInfo *MLI)
SI Optimize VGPR LiveRange
unsigned getWavefrontSize() const
const unsigned AndSaveExecTermOpc
const unsigned AndTermOpc
static const LaneMaskConstants & get(const GCNSubtarget &ST)
const unsigned OrSaveExecOpc
const unsigned AndSaveExecOpc
PassT::Result * getCachedResult(IRUnitT &IR) const
Get the cached result of an analysis pass for a given IR unit.
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Represent the analysis usage information of a pass.
AnalysisUsage & addRequired()
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
void applyUpdates(ArrayRef< UpdateType > Updates)
Inform the dominator tree about a sequence of CFG edge insertions and deletions and perform a batch update on the tree.
FunctionPass class - This class is used to implement most global optimizations.
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These methods get and set the calling convention of this function.
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
void removeAllRegUnitsForPhysReg(MCRegister Reg)
Remove associated live ranges for the register units associated with Reg.
MachineInstr * getInstructionFromIndex(SlotIndex index) const
Returns the instruction associated with the given index.
SlotIndex InsertMachineInstrInMaps(MachineInstr &MI)
LLVM_ABI void handleMove(MachineInstr &MI, bool UpdateFlags=false)
Call this method to notify LiveIntervals that instruction MI has been moved within a basic block.
SlotIndex getInstructionIndex(const MachineInstr &Instr) const
Returns the base index of the given instruction.
void RemoveMachineInstrFromMaps(MachineInstr &MI)
SlotIndex getMBBEndIdx(const MachineBasicBlock *mbb) const
Return the last index in the given basic block.
LiveInterval & getInterval(Register Reg)
void removeInterval(Register Reg)
Interval removal.
LiveRange & getRegUnit(MCRegUnit Unit)
Return the live range for register unit Unit.
LLVM_ABI bool shrinkToUses(LiveInterval *li, SmallVectorImpl< MachineInstr * > *dead=nullptr)
After removing some uses of a register, shrink its live range to just the remaining uses.
MachineBasicBlock * getMBBFromIndex(SlotIndex index) const
LiveInterval & createAndComputeVirtRegInterval(Register Reg)
SlotIndex ReplaceMachineInstrInMaps(MachineInstr &MI, MachineInstr &NewMI)
VNInfo * valueIn() const
Return the value that is live-in to the instruction.
This class represents the liveness of a register, stack slot, etc.
const Segment * getSegmentContaining(SlotIndex Idx) const
Return the segment that contains the specified index, or null if there is none.
LiveQueryResult Query(SlotIndex Idx) const
Query Liveness at Idx.
VNInfo * getVNInfoBefore(SlotIndex Idx) const
getVNInfoBefore - Return the VNInfo that is live up to but not necessarily including Idx,...
static MCRegister from(unsigned Val)
Check the provided unsigned value is a valid MCRegister.
An RAII based helper class to modify MachineFunctionProperties when running pass.
LLVM_ABI instr_iterator insert(instr_iterator I, MachineInstr *M)
Insert MI into the instruction list before I, possibly inside a bundle.
succ_iterator succ_begin()
MachineInstr * remove(MachineInstr *I)
Remove the unbundled instruction from the instruction list without deleting it.
LLVM_ABI iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
unsigned succ_size() const
LLVM_ABI iterator getFirstNonPHI()
Returns a pointer to the first instruction in this block that is not a PHINode instruction.
LLVM_ABI DebugLoc findDebugLoc(instr_iterator MBBI)
Find the next valid DebugLoc starting at MBBI, skipping any debug instructions.
pred_iterator pred_begin()
LLVM_ABI MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
instr_iterator instr_end()
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
iterator_range< succ_iterator > successors()
reverse_iterator rbegin()
iterator_range< pred_iterator > predecessors()
MachineInstrBundleIterator< MachineInstr > iterator
Analysis pass which computes a MachineDominatorTree.
Analysis pass which computes a MachineDominatorTree.
DominatorTree Class - Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of pa...
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
Properties which a MachineFunction may have at a given point in time.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
StringRef getName() const
getName - Return the name of the corresponding LLVM function.
void dump() const
dump - Print the current MachineFunction to cerr, useful for debugger use.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
const MachineBasicBlock & front() const
const MachineInstrBuilder & addReg(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
Representation of each machine instruction.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
LLVM_ABI MachineInstr * removeFromParent()
Unlink 'this' from the containing basic block, and return it without deleting it.
const MachineBasicBlock * getParent() const
LLVM_ABI void setDesc(const MCInstrDesc &TID)
Replace the instruction descriptor (thus opcode) of the current instruction with a new one.
MachineOperand class - Representation of each machine instruction operand.
bool isReg() const
isReg - Tests if this is a MO_Register operand.
Register getReg() const
getReg - Returns the register number.
MachinePostDominatorTree - an analysis pass wrapper for DominatorTree used to compute the post-domina...
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI MachineInstr * getVRegDef(Register Reg) const
getVRegDef - Return the machine instr that defines the specified virtual register or null if none is found.
LLVM_ABI Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified register class.
LLVM_ABI LaneBitmask getMaxLaneMaskForVReg(Register Reg) const
Returns a mask covering all bits that can appear in lane masks of subregisters of the virtual registe...
LLVM_ABI const TargetRegisterClass * constrainRegClass(Register Reg, const TargetRegisterClass *RC, unsigned MinNumRegs=0)
constrainRegClass - Constrain the register class of the specified virtual register to be a common sub...
LLVM_ABI void replaceRegWith(Register FromReg, Register ToReg)
replaceRegWith - Replace all instances of FromReg with ToReg in the machine function.
This class implements a map that also provides access to all stored values in a deterministic order.
A set of analyses that are preserved following a run of a transformation pass.
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
PreservedAnalyses & preserve()
Mark an analysis as preserved.
Wrapper class representing virtual and physical registers.
MCRegister asMCReg() const
Utility to check-convert this value to a MCRegister.
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
PreservedAnalyses run(MachineFunction &MF, MachineFunctionAnalysisManager &MFAM)
SlotIndex getBaseIndex() const
Returns the base index associated with this index.
A SetVector that performs no allocations if smaller than a certain size.
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
reference emplace_back(ArgTypes &&... Args)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Represent a constant reference to a string, i.e.
Wrapper class representing a virtual register or register unit.
constexpr bool isVirtualReg() const
constexpr Register asVirtualReg() const
self_iterator getIterator()
This class implements an extremely fast bulk output stream that can only output to a stream.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char WavefrontSize[]
Key for Kernel::CodeProps::Metadata::mWavefrontSize.
LLVM_READONLY int32_t getVOPe32(uint32_t Opcode)
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Flag
These should be considered private to the implementation of the MCInstrDesc class.
NodeAddr< PhiNode * > Phi
This is an optimization pass for GlobalISel generic memory operations.
IterT next_nodbg(IterT It, IterT End, bool SkipPseudoOp=true)
Increment It, then continue incrementing it while it points to a debug instruction.
FunctionAddr VTableAddr Value
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting iteration.
AnalysisManager< MachineFunction > MachineFunctionAnalysisManager
LLVM_ABI PreservedAnalyses getMachineFunctionPassPreservedAnalyses()
Returns the minimum set of Analyses that all machine function passes must preserve.
IterT skipDebugInstructionsForward(IterT It, IterT End, bool SkipPseudoOp=true)
Increment It until it points to a non-debug instruction or to End and return the resulting iterator.
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
DominatorTreeBase< T, false > DomTreeBase
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
LLVM_ATTRIBUTE_VISIBILITY_DEFAULT AnalysisKey InnerAnalysisManagerProxy< AnalysisManagerT, IRUnitT, ExtraArgTs... >::Key
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
FunctionPass * createSIWholeQuadModeLegacyPass()
FunctionAddr VTableAddr Next
DWARFExpression::Operation Op
raw_ostream & operator<<(raw_ostream &OS, const APFixedPoint &FX)
@ Disabled
Don't do any conversion of .debug_str_offsets tables.
LLVM_ABI Printable printMBBReference(const MachineBasicBlock &MBB)
Prints a machine basic block reference.
WorkItem(const BasicBlock *BB, int St)
static constexpr LaneBitmask getAll()
constexpr bool any() const
static constexpr LaneBitmask getNone()