28#define DEBUG_TYPE "gcn-hazard-recognizer"
31 "Number of WMMA hazard V_NOPs hoisted from loops");
33 "Number of WMMA hazards where V_NOP hoisting was not possible");
37struct MFMAPaddingRatioParser :
public cl::parser<unsigned> {
40 bool parse(cl::Option &O, StringRef ArgName, StringRef Arg,
unsigned &
Value) {
42 return O.error(
"'" + Arg +
"' value invalid for uint argument!");
45 return O.error(
"'" + Arg +
"' value must be in the range [0, 100]!");
55 cl::desc(
"Fill a percentage of the latency between "
56 "neighboring MFMA with s_nops."));
61 cl::desc(
"Insert a s_nop x before every instruction"));
65 cl::desc(
"Hoist WMMA hazard V_NOPs from loops to preheaders"));
76 : IsHazardRecognizerMode(
false), CurrCycleInstr(nullptr), MF(MF),
77 ST(MF.getSubtarget<
GCNSubtarget>()), TII(*ST.getInstrInfo()),
78 TRI(TII.getRegisterInfo()), TSchedModel(TII.getSchedModel()), MLI(MLI),
79 ClauseUses(TRI.getNumRegUnits()), ClauseDefs(TRI.getNumRegUnits()) {
80 MaxLookAhead = MF.getRegInfo().isPhysRegUsed(AMDGPU::AGPR0) ? 19 : 5;
85 EmittedInstrs.clear();
97 return Opcode == AMDGPU::V_DIV_FMAS_F32_e64 || Opcode == AMDGPU::V_DIV_FMAS_F64_e64;
101 return Opcode == AMDGPU::S_GETREG_B32 || Opcode == AMDGPU::S_GETREG_B32_const;
106 case AMDGPU::S_SETREG_B32:
107 case AMDGPU::S_SETREG_B32_mode:
108 case AMDGPU::S_SETREG_IMM32_B32:
109 case AMDGPU::S_SETREG_IMM32_B32_mode:
116 return Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32;
120 return Opcode == AMDGPU::S_RFE_B64;
125 case AMDGPU::S_MOVRELS_B32:
126 case AMDGPU::S_MOVRELS_B64:
127 case AMDGPU::S_MOVRELD_B32:
128 case AMDGPU::S_MOVRELD_B64:
137 if (
TII.isAlwaysGDS(
MI.getOpcode()))
140 switch (
MI.getOpcode()) {
141 case AMDGPU::S_SENDMSG:
142 case AMDGPU::S_SENDMSGHALT:
143 case AMDGPU::S_TTRACEDATA:
147 case AMDGPU::DS_PERMUTE_B32:
148 case AMDGPU::DS_BPERMUTE_B32:
151 if (
TII.isDS(
MI.getOpcode())) {
152 int GDS = AMDGPU::getNamedOperandIdx(
MI.getOpcode(),
153 AMDGPU::OpName::gds);
154 if (
MI.getOperand(GDS).getImm())
162 unsigned Opcode =
MI.getOpcode();
163 return Opcode == AMDGPU::V_PERMLANE16_B32_e64 ||
164 Opcode == AMDGPU::V_PERMLANE64_B32 ||
165 Opcode == AMDGPU::V_PERMLANEX16_B32_e64 ||
166 Opcode == AMDGPU::V_PERMLANE16_VAR_B32_e64 ||
167 Opcode == AMDGPU::V_PERMLANEX16_VAR_B32_e64 ||
168 Opcode == AMDGPU::V_PERMLANE16_SWAP_B32_e32 ||
169 Opcode == AMDGPU::V_PERMLANE16_SWAP_B32_e64 ||
170 Opcode == AMDGPU::V_PERMLANE32_SWAP_B32_e32 ||
171 Opcode == AMDGPU::V_PERMLANE32_SWAP_B32_e64 ||
172 Opcode == AMDGPU::V_PERMLANE_BCAST_B32_e64 ||
173 Opcode == AMDGPU::V_PERMLANE_UP_B32_e64 ||
174 Opcode == AMDGPU::V_PERMLANE_DOWN_B32_e64 ||
175 Opcode == AMDGPU::V_PERMLANE_XOR_B32_e64 ||
176 Opcode == AMDGPU::V_PERMLANE_IDX_GEN_B32_e64;
186 AMDGPU::OpName::simm16);
203 if (ST.hasNSAtoVMEMBug() && checkNSAtoVMEMHazard(
MI) > 0)
206 if (checkFPAtomicToDenormModeHazard(
MI) > 0)
210 if (!IsHazardRecognizerMode) {
211 if (checkWMMACoexecutionHazards(
MI) > 0)
215 if (ST.hasNoDataDepHazard())
227 if (
isDivFMas(
MI->getOpcode()) && checkDivFMasHazards(
MI) > 0)
230 if (
isRWLane(
MI->getOpcode()) && checkRWLaneHazards(
MI) > 0)
235 checkMAIVALUHazards(
MI) > 0)
238 if (
isSGetReg(
MI->getOpcode()) && checkGetRegHazards(
MI) > 0)
241 if (
isSSetReg(
MI->getOpcode()) && checkSetRegHazards(
MI) > 0)
244 if (
isRFE(
MI->getOpcode()) && checkRFEHazards(
MI) > 0)
247 if (((ST.hasReadM0MovRelInterpHazard() &&
249 MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
250 MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) ||
252 (ST.hasReadM0LdsDmaHazard() &&
isLdsDma(*
MI)) ||
253 (ST.hasReadM0LdsDirectHazard() &&
254 MI->readsRegister(AMDGPU::LDS_DIRECT,
nullptr))) &&
255 checkReadM0Hazards(
MI) > 0)
262 checkMAILdStHazards(
MI) > 0)
265 if (
MI->isInlineAsm() && checkInlineAsmHazards(
MI) > 0)
273 while (Quantity > 0) {
274 unsigned Arg = std::min(Quantity, 8u);
282GCNHazardRecognizer::getMFMAPipelineWaitStates(
const MachineInstr &
MI)
const {
283 const MCSchedClassDesc *SC = TSchedModel.resolveSchedClass(&
MI);
284 assert(TSchedModel.getWriteProcResBegin(SC) !=
285 TSchedModel.getWriteProcResEnd(SC));
286 return TSchedModel.getWriteProcResBegin(SC)->ReleaseAtCycle;
289void GCNHazardRecognizer::processBundle() {
293 for (;
MI !=
E &&
MI->isInsideBundle(); ++
MI) {
294 CurrCycleInstr = &*
MI;
297 if (IsHazardRecognizerMode) {
298 fixHazards(CurrCycleInstr);
306 for (
unsigned i = 0, e = std::min(WaitStates,
MaxLookAhead - 1); i <
e; ++i)
307 EmittedInstrs.push_front(
nullptr);
309 EmittedInstrs.push_front(CurrCycleInstr);
312 CurrCycleInstr =
nullptr;
316 assert(IsHazardRecognizerMode);
320 if (
MI->isInsideBundle())
330 IsHazardRecognizerMode =
true;
334 CurrCycleInstr =
nullptr;
349 return std::max(WaitStates, checkSMRDHazards(
MI));
351 if (ST.hasNSAtoVMEMBug())
352 WaitStates = std::max(WaitStates, checkNSAtoVMEMHazard(
MI));
354 WaitStates = std::max(WaitStates, checkFPAtomicToDenormModeHazard(
MI));
356 if (ST.hasNoDataDepHazard())
360 WaitStates = std::max(WaitStates, checkVMEMHazards(
MI));
363 WaitStates = std::max(WaitStates, checkVALUHazards(
MI));
366 WaitStates = std::max(WaitStates, checkDPPHazards(
MI));
369 WaitStates = std::max(WaitStates, checkDivFMasHazards(
MI));
372 WaitStates = std::max(WaitStates, checkRWLaneHazards(
MI));
376 checkMAIVALUHazards(
MI) > 0)
377 WaitStates = std::max(WaitStates, checkMAIVALUHazards(
MI));
379 if (
MI->isInlineAsm())
380 return std::max(WaitStates, checkInlineAsmHazards(
MI));
383 return std::max(WaitStates, checkGetRegHazards(
MI));
386 return std::max(WaitStates, checkSetRegHazards(
MI));
389 return std::max(WaitStates, checkRFEHazards(
MI));
391 if ((ST.hasReadM0MovRelInterpHazard() &&
393 MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
394 MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) ||
396 (ST.hasReadM0LdsDmaHazard() &&
isLdsDma(*
MI)) ||
397 (ST.hasReadM0LdsDirectHazard() &&
398 MI->readsRegister(AMDGPU::LDS_DIRECT,
nullptr)))
399 return std::max(WaitStates, checkReadM0Hazards(
MI));
402 return std::max(WaitStates, checkMAIHazards(
MI));
405 return std::max(WaitStates, checkMAILdStHazards(
MI));
408 return std::max(WaitStates, checkPermlaneHazards(
MI));
414 EmittedInstrs.push_front(
nullptr);
420 if (!CurrCycleInstr) {
421 EmittedInstrs.push_front(
nullptr);
425 if (CurrCycleInstr->isBundle()) {
430 unsigned NumWaitStates = TII.getNumWaitStates(*CurrCycleInstr);
431 if (!NumWaitStates) {
432 CurrCycleInstr =
nullptr;
437 EmittedInstrs.push_front(CurrCycleInstr);
444 EmittedInstrs.push_front(
nullptr);
452 CurrCycleInstr =
nullptr;
456 assert(!IsHazardRecognizerMode &&
457 "Bottom-up scheduling shouldn't run in hazard recognizer mode");
467template <
typename StateT>
477 static bool isEqual(
const StateMapKey &
LHS,
const StateMapKey &
RHS) {
482 static inline StateMapKey getEmptyKey() {
487 static unsigned getHashValue(
const StateMapKey &
Key) {
488 return StateT::getHashValue((*
Key.States)[
Key.Idx]);
490 static unsigned getHashValue(
const StateT &State) {
491 return StateT::getHashValue(State);
493 static bool isEqual(
const StateMapKey &
LHS,
const StateMapKey &
RHS) {
494 const auto EKey = getEmptyKey();
495 if (StateMapKey::isEqual(
LHS, EKey) || StateMapKey::isEqual(
RHS, EKey))
496 return StateMapKey::isEqual(
LHS,
RHS);
497 return StateT::isEqual((*
LHS.States)[
LHS.Idx], (*
RHS.States)[
RHS.Idx]);
499 static bool isEqual(
const StateT &
LHS,
const StateMapKey &
RHS) {
500 if (StateMapKey::isEqual(
RHS, getEmptyKey()))
502 return StateT::isEqual(
LHS, (*
RHS.States)[
RHS.Idx]);
511 StateT State = InitialState;
514 unsigned WorkIdx = 0;
516 bool Expired =
false;
517 for (
auto E =
MBB->instr_rend();
I !=
E; ++
I) {
522 auto Result = IsHazard(State, *
I);
530 if (
I->isInlineAsm() ||
I->isMetaInstruction())
533 UpdateState(State, *
I);
537 unsigned StateIdx = States.
size();
538 StateMapKey
Key = {&States, StateIdx};
539 auto Insertion = StateMap.
insert_as(std::pair(
Key, StateIdx), State);
540 if (Insertion.second) {
543 StateIdx = Insertion.first->second;
546 Worklist.
insert(std::pair(Pred, StateIdx));
549 if (WorkIdx == Worklist.
size())
553 std::tie(
MBB, StateIdx) = Worklist[WorkIdx++];
554 State = States[StateIdx];
555 I =
MBB->instr_rbegin();
572 for (
auto E =
MBB->instr_rend();
I !=
E; ++
I) {
580 if (
I->isInlineAsm())
583 WaitStates += GetNumWaitStates(*
I);
585 if (IsExpired(*
I, WaitStates))
586 return std::numeric_limits<int>::max();
589 int MinWaitStates = std::numeric_limits<int>::max();
591 if (!Visited.
insert(Pred).second)
595 IsExpired, Visited, GetNumWaitStates);
597 MinWaitStates = std::min(MinWaitStates, W);
600 return MinWaitStates;
611 std::next(
MI->getReverseIterator()), 0, IsExpired,
612 Visited, GetNumWaitStates);
615int GCNHazardRecognizer::getWaitStatesSince(
616 IsHazardFn IsHazard,
int Limit, GetNumWaitStatesFn GetNumWaitStates)
const {
617 if (IsHazardRecognizerMode) {
618 auto IsExpiredFn = [Limit](
const MachineInstr &,
int WaitStates) {
619 return WaitStates >= Limit;
621 return ::getWaitStatesSince(IsHazard, CurrCycleInstr,
IsExpiredFn,
626 for (MachineInstr *
MI : EmittedInstrs) {
631 if (
MI->isInlineAsm())
634 WaitStates +=
MI ? GetNumWaitStates(*
MI) : 1;
636 if (WaitStates >= Limit)
639 return std::numeric_limits<int>::max();
642int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard,
647int GCNHazardRecognizer::getWaitStatesSinceDef(
unsigned Reg,
648 IsHazardFn IsHazardDef,
650 const SIRegisterInfo *TRI = ST.getRegisterInfo();
653 return IsHazardDef(
MI) &&
MI.modifiesRegister(
Reg, TRI);
659int GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard,
674 for (MCRegUnit Unit :
TRI.regunits(
Reg))
675 BV.
set(
static_cast<unsigned>(Unit));
687void GCNHazardRecognizer::addClauseInst(
const MachineInstr &
MI)
const {
699int GCNHazardRecognizer::checkSoftClauseHazards(
MachineInstr *MEM)
const {
702 if (!ST.isXNACKEnabled())
705 bool IsSMRD = TII.isSMRD(*MEM);
719 for (MachineInstr *
MI : EmittedInstrs) {
731 if (ClauseDefs.none())
744 return ClauseDefs.anyCommon(ClauseUses) ? 1 : 0;
747int GCNHazardRecognizer::checkSMRDHazards(
MachineInstr *SMRD)
const {
748 int WaitStatesNeeded = 0;
750 WaitStatesNeeded = checkSoftClauseHazards(SMRD);
753 if (!ST.hasSMRDReadVALUDefHazard())
754 return WaitStatesNeeded;
758 int SmrdSgprWaitStates = 4;
759 auto IsHazardDefFn = [
this](
const MachineInstr &
MI) {
760 return TII.isVALU(
MI);
762 auto IsBufferHazardDefFn = [
this](
const MachineInstr &
MI) {
763 return TII.isSALU(
MI);
766 bool IsBufferSMRD = TII.isBufferSMRD(*SMRD);
768 for (
const MachineOperand &Use :
SMRD->uses()) {
771 int WaitStatesNeededForUse =
772 SmrdSgprWaitStates - getWaitStatesSinceDef(
Use.getReg(), IsHazardDefFn,
774 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
784 int WaitStatesNeededForUse =
785 SmrdSgprWaitStates - getWaitStatesSinceDef(
Use.getReg(),
788 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
792 return WaitStatesNeeded;
795int GCNHazardRecognizer::checkVMEMHazards(
MachineInstr *VMEM)
const {
796 if (!ST.hasVMEMReadSGPRVALUDefHazard())
799 int WaitStatesNeeded = checkSoftClauseHazards(VMEM);
803 const int VmemSgprWaitStates = 5;
804 auto IsHazardDefFn = [
this](
const MachineInstr &
MI) {
805 return TII.isVALU(
MI);
807 for (
const MachineOperand &Use :
VMEM->uses()) {
808 if (!
Use.isReg() || TRI.isVectorRegister(MF.getRegInfo(),
Use.getReg()))
811 int WaitStatesNeededForUse =
812 VmemSgprWaitStates - getWaitStatesSinceDef(
Use.getReg(), IsHazardDefFn,
814 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
816 return WaitStatesNeeded;
820 const SIRegisterInfo *TRI = ST.getRegisterInfo();
821 const SIInstrInfo *TII = ST.getInstrInfo();
824 int DppVgprWaitStates = 2;
825 int DppExecWaitStates = 5;
826 int WaitStatesNeeded = 0;
827 auto IsHazardDefFn = [TII](
const MachineInstr &
MI) {
828 return TII->isVALU(
MI);
831 for (
const MachineOperand &Use :
DPP->uses()) {
832 if (!
Use.isReg() || !TRI->isVGPR(MF.getRegInfo(),
Use.getReg()))
834 int WaitStatesNeededForUse =
835 DppVgprWaitStates - getWaitStatesSinceDef(
837 [](
const MachineInstr &) { return true; },
839 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
842 WaitStatesNeeded = std::max(
844 DppExecWaitStates - getWaitStatesSinceDef(AMDGPU::EXEC, IsHazardDefFn,
847 return WaitStatesNeeded;
850int GCNHazardRecognizer::checkDivFMasHazards(
MachineInstr *DivFMas)
const {
851 const SIInstrInfo *TII = ST.getInstrInfo();
855 const int DivFMasWaitStates = 4;
856 auto IsHazardDefFn = [TII](
const MachineInstr &
MI) {
857 return TII->isVALU(
MI);
859 int WaitStatesNeeded = getWaitStatesSinceDef(AMDGPU::VCC, IsHazardDefFn,
862 return DivFMasWaitStates - WaitStatesNeeded;
865int GCNHazardRecognizer::checkGetRegHazards(
MachineInstr *GetRegInstr)
const {
866 const SIInstrInfo *TII = ST.getInstrInfo();
867 unsigned GetRegHWReg =
getHWReg(TII, *GetRegInstr);
869 const int GetRegWaitStates = 2;
870 auto IsHazardFn = [TII, GetRegHWReg](
const MachineInstr &
MI) {
873 int WaitStatesNeeded = getWaitStatesSinceSetReg(
IsHazardFn, GetRegWaitStates);
875 return GetRegWaitStates - WaitStatesNeeded;
878int GCNHazardRecognizer::checkSetRegHazards(
MachineInstr *SetRegInstr)
const {
879 const SIInstrInfo *TII = ST.getInstrInfo();
880 unsigned HWReg =
getHWReg(TII, *SetRegInstr);
882 const int SetRegWaitStates = ST.getSetRegWaitStates();
883 auto IsHazardFn = [TII, HWReg](
const MachineInstr &
MI) {
886 int WaitStatesNeeded = getWaitStatesSinceSetReg(
IsHazardFn, SetRegWaitStates);
887 return SetRegWaitStates - WaitStatesNeeded;
890int GCNHazardRecognizer::createsVALUHazard(
const MachineInstr &
MI)
const {
894 const SIInstrInfo *TII = ST.getInstrInfo();
895 unsigned Opcode =
MI.getOpcode();
896 const MCInstrDesc &
Desc =
MI.getDesc();
898 int VDataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
901 VDataRCID = TII->getOpRegClassID(
Desc.operands()[VDataIdx]);
903 if (TII->isMUBUF(
MI) || TII->isMTBUF(
MI)) {
913 if (ST.hasGFX940Insts())
915 const MachineOperand *SOffset =
916 TII->getNamedOperand(
MI, AMDGPU::OpName::soffset);
917 if (!SOffset || !SOffset->
isReg())
926 if (TII->isMIMG(
MI)) {
927 int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc);
929 Desc.operands()[SRsrcIdx])) == 256);
933 if (TII->isFLAT(
MI)) {
945int GCNHazardRecognizer::checkVALUHazardsHelper(
957 const SIRegisterInfo *TRI = ST.getRegisterInfo();
958 const SIInstrInfo *TII = ST.getInstrInfo();
960 int WaitStatesNeeded = 0;
961 if (!TRI->isVectorRegister(MRI,
Def.getReg()))
962 return WaitStatesNeeded;
965 const int MaxWaitStates = ST.hasGFX940Insts() ? 2 : 1;
970 auto WindowFor = [
this, TII](
const MachineInstr &
MI) ->
int {
971 if (!ST.hasGFX940Insts())
973 if (TII->isBUF(
MI)) {
974 const MachineOperand *SOffset =
975 TII->getNamedOperand(
MI, AMDGPU::OpName::soffset);
976 if (SOffset && SOffset->
isReg())
986 auto Counter = [&](
const MachineInstr &
MI) {
987 int DataIdx = createsVALUHazard(
MI);
989 TRI->regsOverlap(
MI.getOperand(DataIdx).getReg(),
Reg)) {
990 int Need = WindowFor(
MI) - Distance;
991 WaitStatesNeeded = std::max(WaitStatesNeeded, Need);
995 if (!
MI.isInlineAsm())
999 getWaitStatesSince(Counter, MaxWaitStates);
1001 return WaitStatesNeeded;
1017 unsigned Opcode =
MI.getOpcode();
1027 if (
auto *DstSel =
TII->getNamedOperand(
MI, AMDGPU::OpName::dst_sel))
1029 return TII->getNamedOperand(
MI, AMDGPU::OpName::vdst);
1035 if (
TII->getNamedImmOperand(
MI, AMDGPU::OpName::src0_modifiers) &
1037 return TII->getNamedOperand(
MI, AMDGPU::OpName::vdst);
1041 (
TII->getNamedImmOperand(
MI, AMDGPU::OpName::src2_modifiers) &
1043 return TII->getNamedOperand(
MI, AMDGPU::OpName::vdst);
1049 return TII->getNamedOperand(
MI, AMDGPU::OpName::vdst);
1070 for (
auto &Operand : VALU->operands()) {
1071 if (Operand.isReg() &&
TRI->regsOverlap(Dst->getReg(), Operand.getReg())) {
1078int GCNHazardRecognizer::checkVALUHazards(
MachineInstr *VALU)
const {
1079 int WaitStatesNeeded = 0;
1082 const int TransDefWaitstates = 1;
1084 auto IsTransDefFn = [
this,
VALU](
const MachineInstr &
MI) {
1087 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1088 const SIInstrInfo *TII = ST.getInstrInfo();
1089 Register Def = TII->getNamedOperand(
MI, AMDGPU::OpName::vdst)->getReg();
1091 for (
const MachineOperand &Use :
VALU->explicit_uses()) {
1092 if (
Use.isReg() && TRI->regsOverlap(Def,
Use.getReg()))
1099 int WaitStatesNeededForDef =
1100 TransDefWaitstates -
1101 getWaitStatesSince(IsTransDefFn, TransDefWaitstates);
1102 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1105 if (ST.hasDstSelForwardingHazard() || ST.hasCvtScaleForwardingHazard()) {
1106 const int Shift16DefWaitstates = 1;
1108 auto IsShift16BitDefFn = [
this,
VALU](
const MachineInstr &ProducerMI) {
1109 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1110 const MachineOperand *ForwardedDst =
1116 if (ProducerMI.isInlineAsm()) {
1118 for (
auto &Def : ProducerMI.all_defs()) {
1127 int WaitStatesNeededForDef =
1128 Shift16DefWaitstates -
1129 getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates);
1130 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1133 if (ST.hasVDecCoExecHazard()) {
1134 const int VALUWriteSGPRVALUReadWaitstates = 2;
1135 const int VALUWriteEXECRWLane = 4;
1136 const int VALUWriteVGPRReadlaneRead = 1;
1138 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1139 const MachineRegisterInfo &MRI = MF.getRegInfo();
1141 auto IsVALUDefSGPRFn = [&
UseReg, TRI](
const MachineInstr &
MI) {
1144 return MI.modifiesRegister(
UseReg, TRI);
1147 for (
const MachineOperand &Use :
VALU->explicit_uses()) {
1152 if (TRI->isSGPRReg(MRI,
UseReg)) {
1153 int WaitStatesNeededForDef =
1154 VALUWriteSGPRVALUReadWaitstates -
1155 getWaitStatesSince(IsVALUDefSGPRFn,
1156 VALUWriteSGPRVALUReadWaitstates);
1157 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1161 if (
VALU->readsRegister(AMDGPU::VCC, TRI)) {
1163 int WaitStatesNeededForDef =
1164 VALUWriteSGPRVALUReadWaitstates -
1165 getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteSGPRVALUReadWaitstates);
1166 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1169 switch (
VALU->getOpcode()) {
1170 case AMDGPU::V_READLANE_B32:
1171 case AMDGPU::V_READFIRSTLANE_B32: {
1172 MachineOperand *Src = TII.getNamedOperand(*VALU, AMDGPU::OpName::src0);
1174 int WaitStatesNeededForDef =
1175 VALUWriteVGPRReadlaneRead -
1176 getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteVGPRReadlaneRead);
1177 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1180 case AMDGPU::V_WRITELANE_B32: {
1182 int WaitStatesNeededForDef =
1183 VALUWriteEXECRWLane -
1184 getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteEXECRWLane);
1185 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1195 if (!ST.has12DWordStoreHazard())
1196 return WaitStatesNeeded;
1198 const MachineRegisterInfo &MRI = MF.getRegInfo();
1200 for (
const MachineOperand &Def :
VALU->defs()) {
1201 WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Def, MRI));
1204 return WaitStatesNeeded;
1207int GCNHazardRecognizer::checkInlineAsmHazards(
MachineInstr *IA)
const {
1216 if (!ST.has12DWordStoreHazard() && !ST.hasDstSelForwardingHazard() &&
1217 !ST.hasCvtScaleForwardingHazard())
1220 const MachineRegisterInfo &MRI = MF.getRegInfo();
1221 int WaitStatesNeeded = 0;
1223 for (
const MachineOperand &
Op :
1225 if (
Op.isReg() &&
Op.isDef()) {
1226 if (!TRI.isVectorRegister(MRI,
Op.getReg()))
1229 if (ST.has12DWordStoreHazard()) {
1231 std::max(WaitStatesNeeded, checkVALUHazardsHelper(
Op, MRI));
1236 if (ST.hasDstSelForwardingHazard()) {
1237 const int Shift16DefWaitstates = 1;
1239 auto IsShift16BitDefFn = [
this, &
IA](
const MachineInstr &ProducerMI) {
1243 return IA->modifiesRegister(Dst->getReg(), &TRI) ||
1244 IA->readsRegister(Dst->getReg(), &TRI);
1246 if (ProducerMI.isInlineAsm()) {
1248 for (
auto &Def : ProducerMI.all_defs()) {
1249 if (
IA->modifiesRegister(
Def.getReg(), &TRI) ||
1250 IA->readsRegister(
Def.getReg(), &TRI)) {
1259 int WaitStatesNeededForDef =
1260 Shift16DefWaitstates -
1261 getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates);
1262 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1265 return WaitStatesNeeded;
1268int GCNHazardRecognizer::checkRWLaneHazards(
MachineInstr *RWLane)
const {
1269 const SIInstrInfo *TII = ST.getInstrInfo();
1270 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1271 const MachineRegisterInfo &MRI = MF.getRegInfo();
1273 const MachineOperand *LaneSelectOp =
1274 TII->getNamedOperand(*RWLane, AMDGPU::OpName::src1);
1276 if (!LaneSelectOp->
isReg() || !TRI->isSGPRReg(MRI, LaneSelectOp->
getReg()))
1280 auto IsHazardFn = [TII](
const MachineInstr &
MI) {
return TII->isVALU(
MI); };
1282 const int RWLaneWaitStates = 4;
1283 int WaitStatesSince = getWaitStatesSinceDef(LaneSelectReg,
IsHazardFn,
1285 return RWLaneWaitStates - WaitStatesSince;
1288int GCNHazardRecognizer::checkRFEHazards(
MachineInstr *RFE)
const {
1289 if (!ST.hasRFEHazards())
1292 const SIInstrInfo *TII = ST.getInstrInfo();
1294 const int RFEWaitStates = 1;
1299 int WaitStatesNeeded = getWaitStatesSinceSetReg(
IsHazardFn, RFEWaitStates);
1300 return RFEWaitStates - WaitStatesNeeded;
1303int GCNHazardRecognizer::checkReadM0Hazards(
MachineInstr *
MI)
const {
1304 const SIInstrInfo *TII = ST.getInstrInfo();
1305 const int ReadM0WaitStates = 1;
1306 auto IsHazardFn = [TII](
const MachineInstr &
MI) {
return TII->isSALU(
MI); };
1307 return ReadM0WaitStates -
1308 getWaitStatesSinceDef(AMDGPU::M0,
IsHazardFn, ReadM0WaitStates);
1313 int WaitStatesNeeded,
bool IsHoisting) {
1315 for (
int I = 0;
I < WaitStatesNeeded; ++
I)
1316 BuildMI(
MBB, InsertPt,
DL, TII.get(AMDGPU::V_NOP_e32));
1320 fixVMEMtoScalarWriteHazards(
MI);
1321 fixVcmpxPermlaneHazards(
MI);
1322 fixSMEMtoVectorWriteHazards(
MI);
1323 fixVcmpxExecWARHazard(
MI);
1324 fixLdsBranchVmemWARHazard(
MI);
1325 if (ST.hasLdsDirect()) {
1326 fixLdsDirectVALUHazard(
MI);
1327 fixLdsDirectVMEMHazard(
MI);
1329 fixVALUPartialForwardingHazard(
MI);
1330 fixVALUTransUseHazard(
MI);
1331 fixVALUTransCoexecutionHazards(
MI);
1333 fixWMMACoexecutionHazards(
MI);
1334 fixShift64HighRegBug(
MI);
1335 fixVALUMaskWriteHazard(
MI);
1336 fixRequiredExportPriority(
MI);
1337 if (ST.requiresWaitIdleBeforeGetReg())
1338 fixGetRegWaitIdle(
MI);
1339 if (ST.hasDsAtomicAsyncBarrierArriveB64PipeBug())
1340 fixDsAtomicAsyncBarrierArriveB64(
MI);
1341 if (ST.hasScratchBaseForwardingHazard())
1342 fixScratchBaseForwardingHazard(
MI);
1343 if (ST.setRegModeNeedsVNOPs())
1349 return (
TII.isVOPC(
MI) ||
1350 (
MI.isCompare() && (
TII.isVOP3(
MI) ||
TII.isSDWA(
MI)))) &&
1351 MI.modifiesRegister(AMDGPU::EXEC, &
TRI);
1354bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(
MachineInstr *
MI) {
1358 const SIInstrInfo *TII = ST.getInstrInfo();
1359 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1365 unsigned Opc =
MI.getOpcode();
1367 Opc != AMDGPU::V_NOP_e64 &&
Opc != AMDGPU::V_NOP_sdwa;
1371 std::numeric_limits<int>::max())
1377 auto *Src0 = TII->getNamedOperand(*
MI, AMDGPU::OpName::src0);
1379 bool IsUndef = Src0->isUndef();
1381 TII->get(AMDGPU::V_MOV_B32_e32))
1388bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(
MachineInstr *
MI) {
1389 if (!ST.hasVMEMtoScalarWriteHazard())
1391 assert(!ST.hasExtendedWaitCounts());
1396 if (
MI->getNumDefs() == 0)
1399 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1405 for (
const MachineOperand &Def :
MI->defs()) {
1406 const MachineOperand *
Op =
1407 I.findRegisterUseOperand(
Def.getReg(), TRI,
false);
1417 (
MI.getOpcode() == AMDGPU::S_WAITCNT &&
1418 !
MI.getOperand(0).getImm()) ||
1419 (
MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1424 std::numeric_limits<int>::max())
1427 const SIInstrInfo *TII = ST.getInstrInfo();
1429 TII->get(AMDGPU::S_WAITCNT_DEPCTR))
1434bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(
MachineInstr *
MI) {
1435 if (!ST.hasSMEMtoVectorWriteHazard())
1437 assert(!ST.hasExtendedWaitCounts());
1442 AMDGPU::OpName SDSTName;
1443 switch (
MI->getOpcode()) {
1444 case AMDGPU::V_READLANE_B32:
1445 case AMDGPU::V_READFIRSTLANE_B32:
1446 SDSTName = AMDGPU::OpName::vdst;
1449 SDSTName = AMDGPU::OpName::sdst;
1453 const SIInstrInfo *TII = ST.getInstrInfo();
1454 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1456 const MachineOperand *SDST = TII->getNamedOperand(*
MI, SDSTName);
1458 for (
const auto &MO :
MI->implicit_operands()) {
1459 if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegBaseClass(MO.getReg()))) {
1470 auto IsHazardFn = [SDSTReg, TRI](
const MachineInstr &
I) {
1475 if (TII->isSALU(
MI)) {
1476 switch (
MI.getOpcode()) {
1477 case AMDGPU::S_SETVSKIP:
1478 case AMDGPU::S_VERSION:
1479 case AMDGPU::S_WAITCNT_VSCNT:
1480 case AMDGPU::S_WAITCNT_VMCNT:
1481 case AMDGPU::S_WAITCNT_EXPCNT:
1484 case AMDGPU::S_WAITCNT_LGKMCNT:
1486 return (
MI.getOperand(1).getImm() == 0) &&
1487 (
MI.getOperand(0).
getReg() == AMDGPU::SGPR_NULL);
1488 case AMDGPU::S_WAITCNT: {
1489 const int64_t
Imm =
MI.getOperand(0).getImm();
1496 MI.getOpcode() == AMDGPU::S_WAIT_IDLE) &&
1497 "unexpected wait count instruction");
1499 if (TII->isSOPP(
MI))
1515 std::numeric_limits<int>::max())
1519 TII->get(AMDGPU::S_MOV_B32), AMDGPU::SGPR_NULL)
1524bool GCNHazardRecognizer::fixVcmpxExecWARHazard(
MachineInstr *
MI) {
1525 if (!ST.hasVcmpxExecWARHazard())
1527 assert(!ST.hasExtendedWaitCounts());
1532 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1533 if (!
MI->modifiesRegister(AMDGPU::EXEC, TRI))
1539 return I.readsRegister(AMDGPU::EXEC, TRI);
1542 const SIInstrInfo *TII = ST.getInstrInfo();
1543 auto IsExpiredFn = [TII, TRI](
const MachineInstr &
MI, int) {
1545 if (TII->getNamedOperand(
MI, AMDGPU::OpName::sdst))
1547 for (
auto MO :
MI.implicit_operands())
1548 if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegBaseClass(MO.getReg())))
1551 if (
MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1558 std::numeric_limits<int>::max())
1562 TII->get(AMDGPU::S_WAITCNT_DEPCTR))
1569 if (!ST.hasLdsBranchVmemWARHazard())
1574 bool HasLds =
false;
1575 bool HasVmem =
false;
1576 for (
auto &
MBB : MF) {
1577 for (
auto &
MI :
MBB) {
1580 if (HasLds && HasVmem)
1588 return I.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
1589 I.getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
1590 !
I.getOperand(1).getImm();
1593bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(
MachineInstr *
MI) {
1594 if (!RunLdsBranchVmemWARHazardFixup)
1597 assert(ST.hasLdsBranchVmemWARHazard());
1598 assert(!ST.hasExtendedWaitCounts());
1600 auto IsHazardInst = [](
const MachineInstr &
MI) {
1608 auto InstType = IsHazardInst(*
MI);
1612 auto IsExpiredFn = [&IsHazardInst](
const MachineInstr &
I, int) {
1616 auto IsHazardFn = [InstType, &IsHazardInst](
const MachineInstr &
I) {
1620 auto IsHazardFn = [InstType, IsHazardInst](
const MachineInstr &
I) {
1621 auto InstType2 = IsHazardInst(
I);
1622 return InstType2 && InstType != InstType2;
1625 auto IsExpiredFn = [InstType, &IsHazardInst](
const MachineInstr &
I, int) {
1626 auto InstType2 = IsHazardInst(
I);
1627 if (InstType == InstType2)
1634 std::numeric_limits<int>::max();
1638 std::numeric_limits<int>::max())
1641 const SIInstrInfo *TII = ST.getInstrInfo();
1643 TII->get(AMDGPU::S_WAITCNT_VSCNT))
1650bool GCNHazardRecognizer::fixLdsDirectVALUHazard(
MachineInstr *
MI) {
1654 const int NoHazardWaitStates = 15;
1655 const MachineOperand *VDST = TII.getNamedOperand(*
MI, AMDGPU::OpName::vdst);
1658 bool VisitedTrans =
false;
1659 auto IsHazardFn = [
this, VDSTReg, &VisitedTrans](
const MachineInstr &
I) {
1664 return I.readsRegister(VDSTReg, &TRI) ||
I.modifiesRegister(VDSTReg, &TRI);
1666 auto IsExpiredFn = [&](
const MachineInstr &
I,
int WaitStates) {
1667 if (WaitStates >= NoHazardWaitStates)
1673 auto GetWaitStatesFn = [](
const MachineInstr &
MI) {
1677 DenseSet<const MachineBasicBlock *> Visited;
1679 std::next(
MI->getReverseIterator()), 0,
1687 MachineOperand *WaitVdstOp =
1688 TII.getNamedOperand(*
MI, AMDGPU::OpName::waitvdst);
1689 WaitVdstOp->
setImm(std::min(
Count, NoHazardWaitStates));
1694bool GCNHazardRecognizer::fixLdsDirectVMEMHazard(
MachineInstr *
MI) {
1698 const MachineOperand *VDST = TII.getNamedOperand(*
MI, AMDGPU::OpName::vdst);
1701 auto IsHazardFn = [
this, VDSTReg](
const MachineInstr &
I) {
1704 return I.readsRegister(VDSTReg, &TRI) ||
I.modifiesRegister(VDSTReg, &TRI);
1706 bool LdsdirCanWait = ST.hasLdsWaitVMSRC();
1709 auto IsExpiredFn = [
this, LdsdirCanWait](
const MachineInstr &
I, int) {
1711 (
I.getOpcode() == AMDGPU::S_WAITCNT && !
I.getOperand(0).getImm()) ||
1712 (
I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1715 !TII.getNamedOperand(
I, AMDGPU::OpName::waitvsrc)->getImm());
1719 std::numeric_limits<int>::max())
1722 if (LdsdirCanWait) {
1723 TII.getNamedOperand(*
MI, AMDGPU::OpName::waitvsrc)->setImm(0);
1726 TII.get(AMDGPU::S_WAITCNT_DEPCTR))
1733bool GCNHazardRecognizer::fixVALUPartialForwardingHazard(
MachineInstr *
MI) {
1734 if (!ST.hasVALUPartialForwardingHazard())
1736 assert(!ST.hasExtendedWaitCounts());
1741 SmallSetVector<Register, 4> SrcVGPRs;
1743 for (
const MachineOperand &Use :
MI->explicit_uses()) {
1744 if (
Use.isReg() && TRI.isVGPR(MF.getRegInfo(),
Use.getReg()))
1749 if (SrcVGPRs.
size() <= 1)
1767 const int Intv1plus2MaxVALUs = 2;
1768 const int Intv3MaxVALUs = 4;
1769 const int IntvMaxVALUs = 6;
1770 const int NoHazardVALUWaitStates = IntvMaxVALUs + 2;
1773 SmallDenseMap<Register, int, 4> DefPos;
1774 int ExecPos = std::numeric_limits<int>::max();
1777 static unsigned getHashValue(
const StateType &State) {
1781 static bool isEqual(
const StateType &
LHS,
const StateType &
RHS) {
1782 return LHS.DefPos ==
RHS.DefPos &&
LHS.ExecPos ==
RHS.ExecPos &&
1790 auto IsHazardFn = [&,
this](StateType &State,
const MachineInstr &
I) {
1792 if (State.VALUs > NoHazardVALUWaitStates)
1798 (
I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1806 if (!State.DefPos.count(Src) &&
I.modifiesRegister(Src, &TRI)) {
1807 State.DefPos[Src] = State.VALUs;
1812 if (State.ExecPos == std::numeric_limits<int>::max()) {
1813 if (!State.DefPos.empty() &&
I.modifiesRegister(AMDGPU::EXEC, &TRI)) {
1814 State.ExecPos = State.VALUs;
1821 if (State.VALUs > Intv3MaxVALUs && State.DefPos.empty())
1829 if (State.ExecPos == std::numeric_limits<int>::max())
1832 int PreExecPos = std::numeric_limits<int>::max();
1833 int PostExecPos = std::numeric_limits<int>::max();
1835 for (
auto Entry : State.DefPos) {
1836 int DefVALUs =
Entry.second;
1837 if (DefVALUs != std::numeric_limits<int>::max()) {
1838 if (DefVALUs >= State.ExecPos)
1839 PreExecPos = std::min(PreExecPos, DefVALUs);
1841 PostExecPos = std::min(PostExecPos, DefVALUs);
1846 if (PostExecPos == std::numeric_limits<int>::max())
1850 int Intv3VALUs = PostExecPos;
1851 if (Intv3VALUs > Intv3MaxVALUs)
1855 int Intv2VALUs = (State.ExecPos - PostExecPos) - 1;
1856 if (Intv2VALUs > Intv1plus2MaxVALUs)
1860 if (PreExecPos == std::numeric_limits<int>::max())
1864 int Intv1VALUs = PreExecPos - State.ExecPos;
1865 if (Intv1VALUs > Intv1plus2MaxVALUs)
1869 if (Intv1VALUs + Intv2VALUs > Intv1plus2MaxVALUs)
1874 auto UpdateStateFn = [](StateType &State,
const MachineInstr &
MI) {
1880 std::next(
MI->getReverseIterator())))
1884 TII.get(AMDGPU::S_WAITCNT_DEPCTR))
// Handles the "VALU trans use" hazard for MI. Only relevant on subtargets
// reporting ST.hasVALUTransUseHazard(), which must not have extended wait
// counts (see the assert).
// Visible outline: collect the explicit VGPR uses of MI, then run a stateful
// backward search (state = {VALUs, TRANS} counters) bounded by IntvMaxVALUs /
// IntvMaxTRANS; the tail references S_WAITCNT_DEPCTR.
// NOTE(review): several statements of this function are not visible in this
// listing; confirm details against the full source.
1890bool GCNHazardRecognizer::fixVALUTransUseHazard(
MachineInstr *
MI) {
1891 if (!ST.hasVALUTransUseHazard())
1893 assert(!ST.hasExtendedWaitCounts());
// Explicit VGPR source operands of MI are the registers of interest.
1898 SmallSet<Register, 4> SrcVGPRs;
1900 for (
const MachineOperand &Use :
MI->explicit_uses()) {
1901 if (
Use.isReg() && TRI.isVGPR(MF.getRegInfo(),
Use.getReg()))
// Search limits: the search state is abandoned once more than this many
// VALU / TRANS instructions have been counted (see IsHazardFn).
1915 const int IntvMaxVALUs = 5;
1916 const int IntvMaxTRANS = 1;
// Hash/equality helpers for the search state; two states are equal when both
// counters match.
1922 static unsigned getHashValue(
const StateType &State) {
1925 static bool isEqual(
const StateType &
LHS,
const StateType &
RHS) {
1926 return LHS.VALUs ==
RHS.VALUs &&
LHS.TRANS ==
RHS.TRANS;
// Per-instruction hazard test used by the backward search.
1933 auto IsHazardFn = [&,
this](StateType &State,
const MachineInstr &
I) {
1935 if (State.VALUs > IntvMaxVALUs || State.TRANS > IntvMaxTRANS)
1941 (
I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1948 if (
I.modifiesRegister(Src, &TRI)) {
// Per-instruction state update used by the backward search.
1956 auto UpdateStateFn = [](StateType &State,
const MachineInstr &
MI) {
// The search starts at the instruction preceding MI.
1964 std::next(
MI->getReverseIterator())))
1970 TII.get(AMDGPU::S_WAITCNT_DEPCTR))
// Handles TRANS/VALU co-execution hazards for MI; the leading guard involves
// ST.hasGFX1250Insts() (rest of the condition is not visible here).
// A previous instruction I is checked in both directions:
//   * I's vdst overlaps one of MI's explicit register uses, or
//   * MI's vdst overlaps one of I's explicit register uses.
// The visible tail builds a V_NOP_e32 in front of MI.
// NOTE(review): some statements are not visible in this listing; confirm the
// early-exit conditions and return values against the full source.
1976bool GCNHazardRecognizer::fixVALUTransCoexecutionHazards(
MachineInstr *
MI) {
1977 if (!ST.hasGFX1250Insts() ||
1981 const SIInstrInfo *TII = ST.getInstrInfo();
1982 const SIRegisterInfo *TRI = ST.getRegisterInfo();
// Predicate over an earlier instruction I.
1984 auto IsTransHazardFn = [
MI, TII, TRI](
const MachineInstr &
I) {
// Direction 1: register written by I (vdst) is read by MI.
1989 Register TransDef = TII->getNamedOperand(
I, AMDGPU::OpName::vdst)->getReg();
1990 for (
const MachineOperand &ValuUse :
MI->explicit_uses()) {
1991 if (ValuUse.isReg() && TRI->regsOverlap(TransDef, ValuUse.getReg()))
// Direction 2: register written by MI (vdst, if it has a register one) is
// read by I.
1995 auto *ValuDst = TII->getNamedOperand(*
MI, AMDGPU::OpName::vdst);
1996 if (!ValuDst || !ValuDst->isReg())
2000 Register ValuDef = ValuDst->getReg();
2001 for (
const MachineOperand &TransUse :
I.explicit_uses()) {
2002 if (TransUse.isReg() && TRI->regsOverlap(ValuDef, TransUse.getReg()))
// int max is the value compared against the search result below.
2013 const int HasVALU = std::numeric_limits<int>::max();
2014 if (::getWaitStatesSince(IsTransHazardFn,
MI,
IsExpiredFn) == HasVALU)
// Insert a V_NOP immediately before MI.
2017 BuildMI(*
MI->getParent(),
MI,
MI->getDebugLoc(), TII->get(AMDGPU::V_NOP_e32));
2025 const SIInstrInfo *TII = ST.getInstrInfo();
2026 const SIRegisterInfo *TRI = ST.getRegisterInfo();
2028 auto IsHazardFn = [
MI, TII, TRI,
this](
const MachineInstr &
I) {
2035 TII->getNamedOperand(*
MI, AMDGPU::OpName::src0)->getReg();
2037 TII->getNamedOperand(*
MI, AMDGPU::OpName::src1)->getReg();
2040 TII->getNamedOperand(
I, AMDGPU::OpName::vdst)->getReg();
2042 if (TRI->regsOverlap(PrevDstReg, CurSrc0Reg) ||
2043 TRI->regsOverlap(PrevDstReg, CurSrc1Reg)) {
2052 TII->getNamedOperand(*
MI, AMDGPU::OpName::src2)->getReg();
2053 if (TRI->regsOverlap(PrevDstReg, CurIndex))
2067 std::numeric_limits<int>::max())
2070 BuildMI(*
MI->getParent(),
MI,
MI->getDebugLoc(), TII->get(AMDGPU::V_NOP_e32));
2109 unsigned Category = 0;
2111 unsigned Latency = SchedModel.computeInstrLatency(&
MI);
2114 Category = IsSWMMAC ? 2 : 0;
2117 Category = IsSWMMAC ? 3 : 1;
// Computes how many wait states are still needed before MI because of an
// earlier XDL WMMA instruction. Guarded by ST.hasGFX1250Insts().
// The required distance comes from a table indexed by Category
// (WMMAWaitStates when MI itself is an XDL WMMA, VALUWaitStates otherwise),
// reduced by ExistingVALUs. WaitStatesNeeded starts at -1 and is returned.
// NOTE(review): the statements that set Category and ExistingVALUs are not
// fully visible in this listing; confirm against the full source.
2126int GCNHazardRecognizer::checkWMMACoexecutionHazards(
MachineInstr *
MI)
const {
2127 if (!ST.hasGFX1250Insts())
2130 const SIInstrInfo *TII = ST.getInstrInfo();
// Required wait states per category, for WMMA and for VALU consumers.
2139 const int WMMAWaitStates[] = {5, 9, 3, 5};
2140 const int VALUWaitStates[] = {4, 8, 2, 4};
2141 unsigned Category = 0;
// Hazard predicate when MI is a WMMA: earlier XDL WMMA I whose registers
// overlap MI per hasWMMAToWMMARegOverlap. Category is captured by reference.
2143 auto IsWMMAHazardFn = [
MI, TII, &Category,
this](
const MachineInstr &
I) {
2144 if (!TII->isXDLWMMA(
I))
2148 return hasWMMAToWMMARegOverlap(
I, *
MI);
// Hazard predicate when MI is not a WMMA: uses hasWMMAToVALURegOverlap.
2151 auto IsVALUHazardFn = [
MI, TII, &Category,
this](
const MachineInstr &
I) {
2152 if (!TII->isXDLWMMA(
I))
2156 return hasWMMAToVALURegOverlap(
I, *
MI);
// Callback passed to getWaitStatesSince as its third argument.
2159 auto GetWaitStatesFn = [](
const MachineInstr &
I) {
2163 int WaitStatesNeeded = -1;
2164 int ExistingVALUs = 0;
// WMMA consumer: search limit 9.
2171 if (TII->isXDLWMMA(*
MI)) {
2172 const int WMMAWaitsLimit = 9;
2174 getWaitStatesSince(IsWMMAHazardFn, WMMAWaitsLimit, GetWaitStatesFn);
2175 WaitStatesNeeded = WMMAWaitStates[Category] - ExistingVALUs;
// Other consumer: search limit 8.
2177 const int VALUWaitsLimit = 8;
2179 getWaitStatesSince(IsVALUHazardFn, VALUWaitsLimit, GetWaitStatesFn);
2180 WaitStatesNeeded = VALUWaitStates[Category] - ExistingVALUs;
2183 return WaitStatesNeeded;
// Register-overlap test between an earlier WMMA (`WMMA`) and a later
// instruction `MI`: compares WMMA's vdst (D0) against MI's src0 (A1) and src1
// (B1), and against MI's src2 (Idx1).
// NOTE(review): the parameter list, the condition guarding the src2 check and
// the return statements are not visible in this listing.
2186bool GCNHazardRecognizer::hasWMMAToWMMARegOverlap(
2188 Register D0 = TII.getNamedOperand(WMMA, AMDGPU::OpName::vdst)->getReg();
2189 Register A1 = TII.getNamedOperand(
MI, AMDGPU::OpName::src0)->getReg();
2190 Register B1 = TII.getNamedOperand(
MI, AMDGPU::OpName::src1)->getReg();
// Earlier result feeds the A or B operand of MI.
2193 if (TRI.regsOverlap(D0, A1) || TRI.regsOverlap(D0, B1))
// Earlier result feeds the src2 operand of MI.
2197 Register Idx1 = TII.getNamedOperand(
MI, AMDGPU::OpName::src2)->getReg();
2198 if (TRI.regsOverlap(D0, Idx1))
// Register-overlap test between an earlier WMMA (`WMMA`) and a later
// instruction `MI`. Two directions are visible:
//   * WMMA's vdst (D0) overlaps an explicit register use of MI;
//   * a def of MI overlaps one of the registers collected in WMMARegs
//     (src2 is pushed there; src0/src1 are read as A0/B0).
// NOTE(review): the parameter list, the construction of WMMARegs and the
// return statements are not visible in this listing.
2204bool GCNHazardRecognizer::hasWMMAToVALURegOverlap(
2207 Register D0 = TII.getNamedOperand(WMMA, AMDGPU::OpName::vdst)->getReg();
// MI reads the WMMA result.
2208 for (
const MachineOperand &ValuUse :
MI.explicit_uses()) {
2209 if (ValuUse.isReg() && TRI.regsOverlap(D0, ValuUse.getReg()))
// Source registers of the WMMA.
2214 Register A0 = TII.getNamedOperand(WMMA, AMDGPU::OpName::src0)->getReg();
2215 Register B0 = TII.getNamedOperand(WMMA, AMDGPU::OpName::src1)->getReg();
2219 Register Idx0 = TII.getNamedOperand(WMMA, AMDGPU::OpName::src2)->getReg();
2220 WMMARegs.push_back(Idx0);
// MI writes a register held in WMMARegs.
2223 for (
const MachineOperand &ValuDef :
MI.defs()) {
2224 Register VDstReg = ValuDef.getReg();
2225 for (
Register WMMAReg : WMMARegs) {
2226 if (TRI.regsOverlap(VDstReg, WMMAReg))
// Decides whether instruction I is a WMMA co-execution hazard for MI.
// I must be an XDL WMMA; the overlap test then depends on whether MI is itself
// an XDL WMMA (hasWMMAToWMMARegOverlap) or not (hasWMMAToVALURegOverlap).
// NOTE(review): the second parameter's declaration and the early-return value
// are not visible in this listing.
2233bool GCNHazardRecognizer::isCoexecutionHazardFor(
const MachineInstr &
I,
2237 if (!TII.isXDLWMMA(
I))
2241 if (TII.isXDLWMMA(
MI))
2242 return hasWMMAToWMMARegOverlap(
I,
MI);
2244 return hasWMMAToVALURegOverlap(
I,
MI);
2250 bool IncludeSubloops) {
2253 for (MachineBasicBlock *
MBB :
L->getBlocks()) {
2254 if (!IncludeSubloops && MLI->getLoopFor(
MBB) != L)
2256 for (MachineInstr &
I : *
MBB) {
2259 if (isCoexecutionHazardFor(
I, *
MI))
// Attempts to move the V_NOPs required for a WMMA hazard on MI out of the loop
// containing MI. WaitStatesNeeded is the number of V_NOPs involved; it is
// added to the NumWMMANopsHoisted statistic, while NumWMMAHoistingBailed is
// incremented on the visible bail-out paths.
// NOTE(review): the bail-out conditions, the parent-loop walk and the actual
// insertion point are only partly visible in this listing; confirm against
// the full source.
2266bool GCNHazardRecognizer::tryHoistWMMAVnopsFromLoop(
MachineInstr *
MI,
2267 int WaitStatesNeeded) {
// Loop that contains MI's block, as reported by MLI.
2271 MachineLoop *
L = MLI->getLoopFor(
MI->getParent());
2273 ++NumWMMAHoistingBailed;
// A hazard for MI found in L itself is counted as a bail-out.
2278 if (hasWMMAHazardInLoop(L,
MI)) {
2279 ++NumWMMAHoistingBailed;
// TargetLoop starts as L and may be replaced by a parent loop; the parent is
// probed with IncludeSubloops == false.
2284 MachineLoop *TargetLoop =
L;
2286 if (hasWMMAHazardInLoop(Parent,
MI,
false))
2288 TargetLoop = Parent;
2294 ++NumWMMAHoistingBailed;
2298 LLVM_DEBUG(
dbgs() <<
"WMMA V_NOP Hoisting: Moving " << WaitStatesNeeded
2304 NumWMMANopsHoisted += WaitStatesNeeded;
// Fixes WMMA co-execution hazards for MI: asks checkWMMACoexecutionHazards for
// the number of wait states needed, does nothing further when that is <= 0,
// and otherwise emits that many V_NOPs at MI's position via emitVNops.
// NOTE(review): statements between the check and emitVNops (and the return
// values) are not visible in this listing.
2308bool GCNHazardRecognizer::fixWMMACoexecutionHazards(
MachineInstr *
MI) {
2309 int WaitStatesNeeded = checkWMMACoexecutionHazards(
MI);
2310 if (WaitStatesNeeded <= 0)
2316 emitVNops(*
MI->getParent(),
MI->getIterator(), WaitStatesNeeded);
// Handles the 64-bit shift high-register bug (ST.hasShift64HighRegBug()) for
// V_LSHLREV_B64 / V_LSHRREV_B64 / V_ASHRREV_I64 (e64 forms).
// Visible outline:
//   * the shift amount is operand src0; the condition on it is
//     "is a VGPR and ((AmtReg - VGPR0) & 7) == 7";
//   * a replacement register is picked by scanning a register class for a
//     register MI neither reads nor modifies (64-bit aligned class when the
//     amount register is also modified by MI, 32-bit VGPR class otherwise);
//   * V_SWAP_B32 instructions are built before and after MI, and MI's
//     operand 0 is set to NewReg.
// NOTE(review): several statements are not visible in this listing; confirm
// the exact bail-out conditions and operand rewrites against the full source.
2320bool GCNHazardRecognizer::fixShift64HighRegBug(
MachineInstr *
MI) {
2321 if (!ST.hasShift64HighRegBug())
2323 assert(!ST.hasExtendedWaitCounts());
2325 switch (
MI->getOpcode()) {
2328 case AMDGPU::V_LSHLREV_B64_e64:
2329 case AMDGPU::V_LSHRREV_B64_e64:
2330 case AMDGPU::V_ASHRREV_I64_e64:
// src0 holds the shift amount for these opcodes.
2334 MachineOperand *Amt = TII.getNamedOperand(*
MI, AMDGPU::OpName::src0);
2339 const MachineRegisterInfo &MRI = MF.getRegInfo();
// Tests whether the amount is a VGPR whose index is 7 modulo 8.
2341 if (!TRI.isVGPR(MRI, AmtReg) || ((AmtReg - AMDGPU::VGPR0) & 7) != 7)
// Checks whether the next register (AmtReg + 1) is used; VGPR255 has no
// successor and is excluded from this test.
2344 if (AmtReg != AMDGPU::VGPR255 && MRI.
isPhysRegUsed(AmtReg + 1))
2347 assert(ST.needsAlignedVGPRs());
// The "+ 1" arithmetic above relies on consecutive VGPR enum values.
2348 static_assert(AMDGPU::VGPR0 + 1 == AMDGPU::VGPR1);
2351 MachineBasicBlock *
MBB =
MI->getParent();
2352 MachineOperand *Src1 = TII.getNamedOperand(*
MI, AMDGPU::OpName::src1);
2363 Register DstReg =
MI->getOperand(0).getReg();
2365 Register DstLo = TRI.getSubReg(DstReg, AMDGPU::sub0);
// Overlapped: MI also writes the amount register.
2373 bool Overlapped =
MI->modifiesRegister(AmtReg, &TRI);
// Look for a register that MI does not touch at all.
2375 for (MCRegister
Reg : Overlapped ? AMDGPU::VReg_64_Align2RegClass
2376 : AMDGPU::VGPR_32RegClass) {
2377 if (!
MI->modifiesRegister(
Reg, &TRI) && !
MI->readsRegister(
Reg, &TRI)) {
// In the overlapped case the new amount is the sub1 half of NewReg.
2383 Register NewAmt = Overlapped ? (
Register)TRI.getSubReg(NewReg, AMDGPU::sub1)
2388 NewAmtLo = TRI.getSubReg(NewReg, AMDGPU::sub0);
// Swap built before MI; the new instruction is also passed through
// runOnInstruction.
2401 runOnInstruction(
BuildMI(*
MBB,
MI,
DL, TII.get(AMDGPU::V_SWAP_B32), NewAmt)
// Swaps built right after MI.
2408 BuildMI(*
MBB, std::next(
MI->getIterator()),
DL, TII.get(AMDGPU::V_SWAP_B32),
2414 BuildMI(*
MBB, std::next(
MI->getIterator()),
DL, TII.get(AMDGPU::V_SWAP_B32),
2428 MI->getOperand(0).setReg(NewReg);
// Computes wait states needed for the NSA-to-VMEM hazard
// (ST.hasNSAtoVMEMBug()). The hazard predicate matches an earlier instruction
// whose MIMG encoding is MIMGEncGfx10NSA and whose size is at least 16 bytes.
// Result: NSAtoVMEMWaitStates (1) minus the wait states since such an
// instruction, searching at most 1 state back.
// NOTE(review): the checks on MI itself (including how Offset is used) are
// not visible in this listing.
2437int GCNHazardRecognizer::checkNSAtoVMEMHazard(
MachineInstr *
MI)
const {
2438 int NSAtoVMEMWaitStates = 1;
2440 if (!ST.hasNSAtoVMEMBug())
2446 const SIInstrInfo *TII = ST.getInstrInfo();
2447 const auto *
Offset = TII->getNamedOperand(*
MI, AMDGPU::OpName::offset);
// Tail of the hazard predicate: NSA-encoded MIMG of 16 bytes or more.
2455 return Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA &&
2456 TII->getInstSizeInBytes(
I) >= 16;
2459 return NSAtoVMEMWaitStates - getWaitStatesSince(
IsHazardFn, 1);
// Computes wait states needed between an earlier instruction and an
// S_DENORM_MODE (the function only proceeds when MI is S_DENORM_MODE and
// ST.hasFPAtomicToDenormModeHazard() holds). The required distance is
// FPAtomicToDenormModeWaitStates (3); the result is that constant minus a
// searched distance.
// NOTE(review): the hazard predicate, the expiry lambda body and the search
// call are not visible in this listing; confirm against the full source.
2462int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(
2464 int FPAtomicToDenormModeWaitStates = 3;
2466 if (!ST.hasFPAtomicToDenormModeHazard())
2468 assert(!ST.hasExtendedWaitCounts());
2470 if (
MI->getOpcode() != AMDGPU::S_DENORM_MODE)
// Expiry callback taking the visited instruction and the wait states so far.
2479 auto IsExpiredFn = [](
const MachineInstr &
MI,
int WaitStates) {
2486 return FPAtomicToDenormModeWaitStates -
// Dispatches the MAI hazard check by subtarget: checkMAIHazards90A when
// ST.hasGFX90AInsts() is true, checkMAIHazards908 otherwise.
2490int GCNHazardRecognizer::checkMAIHazards(
MachineInstr *
MI)
const {
2493 return ST.hasGFX90AInsts() ? checkMAIHazards90A(
MI) : checkMAIHazards908(
MI);
// Computes padding wait states to place between MI and a neighboring MFMA.
// Searches back up to MaxMFMAPipelineWaitStates (16) for a neighboring MFMA;
// the predicate records that instruction's pipeline wait states in
// NeighborMFMALatency. The result is clamped to be non-negative.
// NOTE(review): the early exits and the exact padding formula (only its final
// "- WaitStatesSinceNeighborMFMA" term is visible) should be confirmed
// against the full source.
2496int GCNHazardRecognizer::checkMFMAPadding(
MachineInstr *
MI)
const {
2501 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
// Filled in by the predicate below when a neighboring MFMA is found.
2505 int NeighborMFMALatency = 0;
2506 auto IsNeighboringMFMA = [&NeighborMFMALatency,
2507 this](
const MachineInstr &
MI) {
2511 NeighborMFMALatency = this->getMFMAPipelineWaitStates(
MI);
2515 const int MaxMFMAPipelineWaitStates = 16;
2516 int WaitStatesSinceNeighborMFMA =
2517 getWaitStatesSince(IsNeighboringMFMA, MaxMFMAPipelineWaitStates);
2519 int NeighborMFMAPaddingNeeded =
2521 WaitStatesSinceNeighborMFMA;
// Never report a negative number of wait states.
2523 return std::max(0, NeighborMFMAPaddingNeeded);
// Computes the wait states MI needs for MAI-related hazards on the non-GFX90A
// path (see checkMAIHazards). The result is the maximum over several checks:
//   1. (MI is not V_ACCVGPR_READ) VALU write of EXEC, and VALU writes of the
//      VGPRs MI reads;
//   2. per AGPR operand: an earlier MFMA writing an overlapping register, and
//      an earlier V_ACCVGPR_WRITE of an overlapping register;
//   3. (MI is V_ACCVGPR_WRITE) an earlier MFMA reading MI's destination as
//      src2;
//   4. MFMA padding (checkMFMAPadding).
// Required distances depend on the producer's latency as reported by
// TSchedModel (cases 2, 8, 16/default in the switches below).
// NOTE(review): a number of statements (continue/break/return, some
// declarations) are not visible in this listing.
2526int GCNHazardRecognizer::checkMAIHazards908(
MachineInstr *
MI)
const {
2527 int WaitStatesNeeded = 0;
2528 unsigned Opc =
MI->getOpcode();
2530 auto IsVALUFn = [](
const MachineInstr &
MI) {
// Check 1: VALU producers of EXEC and of MI's VGPR sources.
2534 if (
Opc != AMDGPU::V_ACCVGPR_READ_B32_e64) {
2535 const int LegacyVALUWritesVGPRWaitStates = 2;
2536 const int VALUWritesExecWaitStates = 4;
2537 const int MaxWaitStates = 4;
2539 int WaitStatesNeededForUse = VALUWritesExecWaitStates -
2540 getWaitStatesSinceDef(AMDGPU::EXEC, IsVALUFn, MaxWaitStates);
2541 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
// The per-operand scan is only entered while the result is below the local
// maximum.
2543 if (WaitStatesNeeded < MaxWaitStates) {
2544 for (
const MachineOperand &Use :
MI->explicit_uses()) {
2545 const int MaxWaitStates = 2;
2547 if (!
Use.isReg() || !TRI.isVGPR(MF.getRegInfo(),
Use.getReg()))
2550 int WaitStatesNeededForUse = LegacyVALUWritesVGPRWaitStates -
2551 getWaitStatesSinceDef(
Use.getReg(), IsVALUFn, MaxWaitStates);
2552 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2554 if (WaitStatesNeeded == MaxWaitStates)
// Check 2: AGPR operands of MI.
2560 for (
const MachineOperand &
Op :
MI->explicit_operands()) {
2561 if (!
Op.isReg() || !TRI.isAGPR(MF.getRegInfo(),
Op.getReg()))
2564 if (
Op.isDef() &&
Opc != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
2567 const int MFMAWritesAGPROverlappedSrcABWaitStates = 4;
2568 const int MFMAWritesAGPROverlappedSrcCWaitStates = 2;
2569 const int MFMA4x4WritesAGPRAccVgprReadWaitStates = 4;
2570 const int MFMA16x16WritesAGPRAccVgprReadWaitStates = 10;
2571 const int MFMA32x32WritesAGPRAccVgprReadWaitStates = 18;
2572 const int MFMA4x4WritesAGPRAccVgprWriteWaitStates = 1;
2573 const int MFMA16x16WritesAGPRAccVgprWriteWaitStates = 7;
2574 const int MFMA32x32WritesAGPRAccVgprWriteWaitStates = 15;
2575 const int MaxWaitStates = 18;
// Largest latency among the MFMAs visited by the predicate below.
2577 unsigned HazardDefLatency = 0;
2579 auto IsOverlappedMFMAFn = [
Reg, &HazardDefLatency,
2580 this](
const MachineInstr &
MI) {
2587 std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&
MI));
2588 return TRI.regsOverlap(DstReg,
Reg);
2591 int WaitStatesSinceDef = getWaitStatesSinceDef(
Reg, IsOverlappedMFMAFn,
// Default requirement is the SrcA/B value; refined by operand position and
// by MI's opcode below.
2593 int NeedWaitStates = MFMAWritesAGPROverlappedSrcABWaitStates;
2594 int SrcCIdx = AMDGPU::getNamedOperandIdx(
Opc, AMDGPU::OpName::src2);
2595 int OpNo =
Op.getOperandNo();
2596 if (OpNo == SrcCIdx) {
2597 NeedWaitStates = MFMAWritesAGPROverlappedSrcCWaitStates;
2598 }
else if (
Opc == AMDGPU::V_ACCVGPR_READ_B32_e64) {
2599 switch (HazardDefLatency) {
2600 case 2: NeedWaitStates = MFMA4x4WritesAGPRAccVgprReadWaitStates;
2602 case 8: NeedWaitStates = MFMA16x16WritesAGPRAccVgprReadWaitStates;
2604 case 16: [[fallthrough]];
2605 default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprReadWaitStates;
2608 }
else if (
Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
2609 switch (HazardDefLatency) {
2610 case 2: NeedWaitStates = MFMA4x4WritesAGPRAccVgprWriteWaitStates;
2612 case 8: NeedWaitStates = MFMA16x16WritesAGPRAccVgprWriteWaitStates;
2614 case 16: [[fallthrough]];
2615 default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprWriteWaitStates;
2620 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2621 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
// Nothing can exceed the maximum, so stop early.
2623 if (WaitStatesNeeded == MaxWaitStates)
2624 return WaitStatesNeeded;
// Earlier V_ACCVGPR_WRITE whose destination overlaps Reg.
2626 auto IsAccVgprWriteFn = [
Reg,
this](
const MachineInstr &
MI) {
2627 if (
MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
2630 return TRI.regsOverlap(
Reg, DstReg);
2633 const int AccVGPRWriteMFMAReadSrcCWaitStates = 1;
2634 const int AccVGPRWriteMFMAReadSrcABWaitStates = 3;
2635 const int AccVGPRWriteAccVgprReadWaitStates = 3;
2636 NeedWaitStates = AccVGPRWriteMFMAReadSrcABWaitStates;
2637 if (OpNo == SrcCIdx)
2638 NeedWaitStates = AccVGPRWriteMFMAReadSrcCWaitStates;
2639 else if (
Opc == AMDGPU::V_ACCVGPR_READ_B32_e64)
2640 NeedWaitStates = AccVGPRWriteAccVgprReadWaitStates;
2642 WaitStatesNeededForUse = NeedWaitStates -
2643 getWaitStatesSinceDef(
Reg, IsAccVgprWriteFn, MaxWaitStates);
2644 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2646 if (WaitStatesNeeded == MaxWaitStates)
2647 return WaitStatesNeeded;
// Check 3: MI is V_ACCVGPR_WRITE and an earlier MFMA reads MI's destination
// as its src2.
2650 if (
Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
2651 const int MFMA4x4ReadSrcCAccVgprWriteWaitStates = 0;
2652 const int MFMA16x16ReadSrcCAccVgprWriteWaitStates = 5;
2653 const int MFMA32x32ReadSrcCAccVgprWriteWaitStates = 13;
2654 const int MaxWaitStates = 13;
2655 Register DstReg =
MI->getOperand(0).getReg();
2656 unsigned HazardDefLatency = 0;
2658 auto IsSrcCMFMAFn = [DstReg, &HazardDefLatency,
2659 this](
const MachineInstr &
MI) {
2662 Register Reg = TII.getNamedOperand(
MI, AMDGPU::OpName::src2)->getReg();
2664 std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&
MI));
2665 return TRI.regsOverlap(
Reg, DstReg);
2668 int WaitStatesSince = getWaitStatesSince(IsSrcCMFMAFn, MaxWaitStates);
2670 switch (HazardDefLatency) {
2671 case 2: NeedWaitStates = MFMA4x4ReadSrcCAccVgprWriteWaitStates;
2673 case 8: NeedWaitStates = MFMA16x16ReadSrcCAccVgprWriteWaitStates;
2675 case 16: [[fallthrough]];
2676 default: NeedWaitStates = MFMA32x32ReadSrcCAccVgprWriteWaitStates;
2680 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSince;
2681 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
// Check 4: fold in the MFMA padding requirement.
2685 WaitStatesNeeded = std::max(WaitStatesNeeded, checkMFMAPadding(
MI));
2687 return WaitStatesNeeded;
2698 return NumPasses + 1 + IsGFX950;
2709 return NumPasses + 1 + (NumPasses != 2 && IsGFX950);
2727 return NumPasses + 2;
2737 return NumPasses + 3 + (NumPasses != 2 && IsGFX950);
// Computes the wait states MI needs for MAI-related hazards on the GFX90A
// path (see checkMAIHazards). The result is the maximum over:
//   * a VALU write of EXEC (VALUWritesExecWaitStates = 4);
//   * per explicit use: a non-DOT legacy VALU write of the register, and an
//     earlier MFMA writing an overlapping register, where the required
//     distance depends on which operand reads it (src2 vs. others), on
//     whether the registers match exactly (FullReg), on MI's opcode and on
//     the producer's latency / pass count from TSchedModel;
//   * MFMA padding (checkMFMAPadding).
// NOTE(review): many statements (switch headers, case labels, early exits,
// helper calls) are not visible in this listing; the comments below describe
// only what can be seen and should be confirmed against the full source.
2740int GCNHazardRecognizer::checkMAIHazards90A(
MachineInstr *
MI)
const {
2741 int WaitStatesNeeded = 0;
2742 unsigned Opc =
MI->getOpcode();
2744 auto IsLegacyVALUFn = [](
const MachineInstr &
MI) {
2748 auto IsLegacyVALUNotDotFn = [](
const MachineInstr &
MI) {
2754 return WaitStatesNeeded;
// VALU write of EXEC.
2756 const int VALUWritesExecWaitStates = 4;
2757 int WaitStatesNeededForUse = VALUWritesExecWaitStates -
2758 getWaitStatesSinceDef(AMDGPU::EXEC, IsLegacyVALUFn,
2759 VALUWritesExecWaitStates);
2760 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2762 int SrcCIdx = AMDGPU::getNamedOperandIdx(
Opc, AMDGPU::OpName::src2);
// Per-use checks.
2765 for (
const MachineOperand &Use :
MI->explicit_uses()) {
2766 const int LegacyVALUNotDotWritesVGPRWaitStates = 2;
2767 const int SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates = 2;
2768 const int SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates = 8;
2769 const int SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates = 16;
2770 const int SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates = 3;
2771 const int SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates = 9;
2772 const int SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates = 17;
2773 const int DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 9;
2774 const int GFX950_DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 17;
2775 const int DMFMA4x4WritesVGPROverlappedSrcCWaitStates = 4;
2776 const int SMFMA4x4WritesVGPROverlappedSrcABWaitStates = 5;
2777 const int SMFMA16x16WritesVGPROverlappedSrcABWaitStates = 11;
2778 const int SMFMA32x32WritesVGPROverlappedSrcABWaitStates = 19;
2779 const int DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates = 6;
2780 const int DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 11;
2781 const int GFX950_DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 19;
2782 const int DMFMA4x4WritesVGPRFullSrcCWaitStates = 4;
2783 const int GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates = 2;
2784 const int MaxWaitStates = 19;
// MI1 is captured by reference by the predicate below and later used as the
// producer whose latency is queried.
2790 const MachineInstr *MI1;
2792 auto IsOverlappedMFMAFn = [
Reg, &FullReg, &MI1,
2793 this](
const MachineInstr &
MI) {
// FullReg: the producer's destination is exactly Reg, not just overlapping.
2797 FullReg = (DstReg ==
Reg);
2799 return TRI.regsOverlap(DstReg,
Reg);
2802 WaitStatesNeededForUse = LegacyVALUNotDotWritesVGPRWaitStates -
2803 getWaitStatesSinceDef(
Reg, IsLegacyVALUNotDotFn, MaxWaitStates);
2804 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2807 getWaitStatesSinceDef(
Reg, IsOverlappedMFMAFn, MaxWaitStates);
// int max is the value tested here after the MFMA search.
2808 if (NumWaitStates == std::numeric_limits<int>::max())
2811 int OpNo =
Use.getOperandNo();
2813 int NeedWaitStates = 0;
// Use is MI's src2 operand.
2814 if (OpNo == SrcCIdx) {
2818 }
else if (FullReg) {
2819 if ((
Opc == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
2820 Opc == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64) &&
2821 (Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
2822 Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64))
2823 NeedWaitStates = DMFMA4x4WritesVGPRFullSrcCWaitStates;
2824 else if (ST.hasGFX940Insts() &&
2825 TSchedModel.computeInstrLatency(MI1) == 2)
2826 NeedWaitStates = GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates;
2829 case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
2830 case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
2831 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
2832 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
2833 if (!TII.isXDL(*
MI))
2836 ? GFX950_DMFMA16x16WritesVGPROverlappedSrcCWaitStates
2837 : DMFMA16x16WritesVGPROverlappedSrcCWaitStates;
2839 case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
2840 case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
2841 if (!TII.isXDL(*
MI))
2842 NeedWaitStates = DMFMA4x4WritesVGPROverlappedSrcCWaitStates;
// Remaining producers: requirement derived from the producer's latency.
2845 int NumPasses = TSchedModel.computeInstrLatency(MI1);
2846 if (ST.hasGFX940Insts()) {
2847 if (TII.isXDL(*
MI) && !TII.isXDL(*MI1))
2854 NumPasses, ST.hasGFX950Insts())
2856 NumPasses, ST.hasGFX950Insts()))
2862 switch (NumPasses) {
2866 ? SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates
2867 : SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates;
2872 ? SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates
2873 : SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates;
2878 ? SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates
2879 : SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates;
// Branch assigning the ...SrcAB... constants.
2888 case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
2889 case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
2890 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
2891 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
2894 ? GFX950_DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates
2895 : DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates;
2897 case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
2898 case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
2899 NeedWaitStates = DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates;
2902 int NumPasses = TSchedModel.computeInstrLatency(MI1);
2904 if (ST.hasGFX940Insts()) {
2908 NumPasses, ST.hasGFX950Insts())
2914 switch (NumPasses) {
2916 NeedWaitStates = SMFMA4x4WritesVGPROverlappedSrcABWaitStates;
2921 NeedWaitStates = SMFMA16x16WritesVGPROverlappedSrcABWaitStates;
2925 NeedWaitStates = SMFMA32x32WritesVGPROverlappedSrcABWaitStates;
// Compares the running maximum with this use's requirement.
2929 if (WaitStatesNeeded >= NeedWaitStates)
2932 WaitStatesNeededForUse = NeedWaitStates - NumWaitStates;
2933 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2935 if (WaitStatesNeeded == MaxWaitStates)
// Fold in the MFMA padding requirement.
2940 WaitStatesNeeded = std::max(WaitStatesNeeded, checkMFMAPadding(
MI));
2942 return WaitStatesNeeded;
// Computes wait states a load/store MI needs after accumulator-register
// accesses. Only proceeds on subtargets with MAI instructions that are not
// GFX90A. For every explicit VGPR use of MI:
//   * V_ACCVGPR_READ defining the register requires 2 wait states;
//   * a V_ACCVGPR_READ/WRITE for which a VALU def of the register is found
//     within 2 wait states requires 1 wait state.
// Returns the maximum requirement found.
// NOTE(review): the early-return value, the `Reg` declaration and some lambda
// bodies are not visible in this listing.
2945int GCNHazardRecognizer::checkMAILdStHazards(
MachineInstr *
MI)
const {
2947 if (!ST.hasMAIInsts() || ST.hasGFX90AInsts())
2950 int WaitStatesNeeded = 0;
2952 auto IsAccVgprReadFn = [](
const MachineInstr &
MI) {
2953 return MI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64;
2956 for (
const MachineOperand &
Op :
MI->explicit_uses()) {
// Only VGPR register operands are considered.
2957 if (!
Op.isReg() || !TRI.isVGPR(MF.getRegInfo(),
Op.getReg()))
2962 const int AccVgprReadLdStWaitStates = 2;
2963 const int VALUWriteAccVgprRdWrLdStDepVALUWaitStates = 1;
2964 const int MaxWaitStates = 2;
2966 int WaitStatesNeededForUse = AccVgprReadLdStWaitStates -
2967 getWaitStatesSinceDef(
Reg, IsAccVgprReadFn, MaxWaitStates);
2968 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
// Already at the maximum; no further check can raise it.
2970 if (WaitStatesNeeded == MaxWaitStates)
2971 return WaitStatesNeeded;
// Matches an accvgpr read/write for which a VALU def of Reg is found by a
// nested search limited to 2 wait states.
2973 auto IsVALUAccVgprRdWrCheckFn = [
Reg,
this](
const MachineInstr &
MI) {
2974 if (
MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64 &&
2975 MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
2977 auto IsVALUFn = [](
const MachineInstr &
MI) {
2980 return getWaitStatesSinceDef(
Reg, IsVALUFn, 2 ) <
2981 std::numeric_limits<int>::max();
2984 WaitStatesNeededForUse = VALUWriteAccVgprRdWrLdStDepVALUWaitStates -
2985 getWaitStatesSince(IsVALUAccVgprRdWrCheckFn, MaxWaitStates);
2986 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2989 return WaitStatesNeeded;
// Computes wait states MI needs for permlane hazards. This is distinct from
// the hazard reported by ST.hasVcmpxPermlaneHazard() (see the assert).
// Two requirements are combined with max():
//   * 2 wait states after a VALU write of any explicit VGPR use of MI;
//   * 4 wait states after an instruction matched by IsVCmpXWritesExecFn.
// NOTE(review): the bodies of the two predicates and the `Reg` declaration
// are not visible in this listing.
2992int GCNHazardRecognizer::checkPermlaneHazards(
MachineInstr *
MI)
const {
2993 assert(!ST.hasVcmpxPermlaneHazard() &&
2994 "this is a different vcmpx+permlane hazard");
2995 const SIRegisterInfo *TRI = ST.getRegisterInfo();
2996 const SIInstrInfo *TII = ST.getInstrInfo();
2998 auto IsVCmpXWritesExecFn = [TII, TRI](
const MachineInstr &
MI) {
3002 auto IsVALUFn = [](
const MachineInstr &
MI) {
3006 const int VCmpXWritesExecWaitStates = 4;
3007 const int VALUWritesVDstWaitStates = 2;
3008 int WaitStatesNeeded = 0;
// Requirement from VALU writes of MI's VGPR sources.
3010 for (
const MachineOperand &
Op :
MI->explicit_uses()) {
3011 if (!
Op.isReg() || !TRI->isVGPR(MF.getRegInfo(),
Op.getReg()))
// Despite its name, this holds "required minus elapsed" wait states.
3015 int WaitStatesSinceDef =
3016 VALUWritesVDstWaitStates -
3017 getWaitStatesSinceDef(
Reg, IsVALUFn,
3018 VALUWritesVDstWaitStates);
3019 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesSinceDef);
// Tests whether the per-operand maximum has been reached.
3020 if (WaitStatesNeeded >= VALUWritesVDstWaitStates)
// Requirement from the IsVCmpXWritesExecFn predicate.
3024 int VCmpXHazardWaits =
3025 VCmpXWritesExecWaitStates -
3026 getWaitStatesSince(IsVCmpXWritesExecFn, VCmpXWritesExecWaitStates);
3028 WaitStatesNeeded = std::max(WaitStatesNeeded, VCmpXHazardWaits);
3029 return WaitStatesNeeded;
3037 return NumPasses + 2;
3047 return NumPasses + 3 + (NumPasses != 2 && IsGFX950);
3057 return NumPasses + 3 + (NumPasses != 2 && IsGFX950);
3065 return NumPasses + 2;
// Computes the wait states MI needs for hazards between MFMA/DOT producers
// and VALU / memory / export consumers on subtargets with GFX90A
// instructions. The visible structure has two halves:
//   * read side (IsMemOrExport || IsVALU): for each explicit use, distance to
//     an earlier DOT write, to a DGEMM following a VALU write (memory
//     consumers on GFX90A without GFX940 only), and to an earlier MFMA write;
//     plus a DGEMM -> 64-bit FMA requirement of 2 wait states;
//   * write side: for each def of MI, write-after-write distance to earlier
//     DOT / MFMA writes and write-after-read distance to an earlier SMFMA
//     that reads the register as src2.
// Required distances depend on the producer's latency from TSchedModel.
// NOTE(review): many statements (declarations of Reg/IsVALU/IsMem/
// IsMemOrExport, case labels, continue/break, helper calls) are not visible
// in this listing; confirm details against the full source.
3068int GCNHazardRecognizer::checkMAIVALUHazards(
MachineInstr *
MI)
const {
3069 if (!ST.hasGFX90AInsts())
3072 auto IsDGEMMFn = [](
const MachineInstr &
MI) ->
bool {
3080 const MachineRegisterInfo &MRI = MF.getRegInfo();
3082 int WaitStatesNeeded = 0;
// MFMA is captured by reference by IsMFMAWriteFn and later passed to
// TSchedModel.computeInstrLatency.
3088 const MachineInstr *
MFMA =
nullptr;
3090 auto IsMFMAWriteFn = [&
Reg, &
MFMA,
this](
const MachineInstr &
MI) {
3092 !TRI.regsOverlap(
MI.getOperand(0).getReg(),
Reg))
// Same pattern for DOT producers.
3098 const MachineInstr *
DOT =
nullptr;
3099 auto IsDotWriteFn = [&
Reg, &
DOT,
this](
const MachineInstr &
MI) {
3101 !TRI.regsOverlap(
MI.getOperand(0).getReg(),
Reg))
// Stateful predicate: DGEMMAfterVALUWrite is set to true on one path, and a
// later test requires both "is a VALU" and that flag.
3107 bool DGEMMAfterVALUWrite =
false;
3108 auto IsDGEMMHazard = [&DGEMMAfterVALUWrite,
this](
const MachineInstr &
MI) {
3111 DGEMMAfterVALUWrite =
true;
3115 if (!TII.isVALU(
MI) || !DGEMMAfterVALUWrite)
3121 int SrcCIdx = AMDGPU::getNamedOperandIdx(
MI->getOpcode(),
3122 AMDGPU::OpName::src2);
// Read side.
3124 if (IsMemOrExport || IsVALU) {
3125 const int SMFMA4x4WriteVgprVALUMemExpReadWaitStates = 5;
3126 const int SMFMA16x16WriteVgprVALUMemExpReadWaitStates = 11;
3127 const int SMFMA32x32WriteVgprVALUMemExpReadWaitStates = 19;
3128 const int DMFMA4x4WriteVgprMemExpReadWaitStates = 9;
3129 const int DMFMA16x16WriteVgprMemExpReadWaitStates = 18;
3130 const int DMFMA4x4WriteVgprVALUReadWaitStates = 6;
3131 const int DMFMA16x16WriteVgprVALUReadWaitStates = 11;
3132 const int GFX950_DMFMA16x16WriteVgprVALUReadWaitStates = 19;
3133 const int DotWriteSameDotReadSrcAB = 3;
3134 const int DotWriteDifferentVALURead = 3;
3135 const int DMFMABetweenVALUWriteVMEMRead = 2;
3136 const int MaxWaitStates = 19;
3138 for (
const MachineOperand &Use :
MI->explicit_uses()) {
// Earlier DOT writing the register MI reads.
3144 int WaitStatesSinceDef = getWaitStatesSinceDef(
Reg, IsDotWriteFn,
3147 int NeedWaitStates = 0;
// Same opcode as MI: the requirement applies only when Use is not the src2
// operand (operand index computed by pointer difference).
3148 if (
DOT->getOpcode() ==
MI->getOpcode()) {
3149 if (&Use - &
MI->getOperand(0) != SrcCIdx)
3150 NeedWaitStates = DotWriteSameDotReadSrcAB;
3152 NeedWaitStates = DotWriteDifferentVALURead;
3155 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
3156 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
// Memory consumers on GFX90A (not GFX940): DGEMM between a VALU write and
// this read. The flag is reset before each search.
3163 if (IsMem && ST.hasGFX90AInsts() && !ST.hasGFX940Insts()) {
3164 DGEMMAfterVALUWrite =
false;
3165 if (TRI.isVectorRegister(MRI,
Reg)) {
3166 int WaitStatesNeededForUse =
3167 DMFMABetweenVALUWriteVMEMRead -
3168 getWaitStatesSinceDef(
Reg, IsDGEMMHazard,
3169 DMFMABetweenVALUWriteVMEMRead);
3171 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
// Earlier MFMA writing the register MI reads.
3176 WaitStatesSinceDef =
3177 getWaitStatesSinceDef(
Reg, IsMFMAWriteFn, MaxWaitStates);
3181 unsigned HazardDefLatency = TSchedModel.computeInstrLatency(
MFMA);
3182 int NumPasses = HazardDefLatency;
3183 int NeedWaitStates = MaxWaitStates;
3186 switch (HazardDefLatency) {
3188 NeedWaitStates = IsMemOrExport ? DMFMA4x4WriteVgprMemExpReadWaitStates
3189 : DMFMA4x4WriteVgprVALUReadWaitStates;
3195 ? DMFMA16x16WriteVgprMemExpReadWaitStates
3196 : (ST.hasGFX950Insts()
3197 ? GFX950_DMFMA16x16WriteVgprVALUReadWaitStates
3198 : DMFMA16x16WriteVgprVALUReadWaitStates);
3203 }
else if (ST.hasGFX940Insts()) {
3207 NumPasses, ST.hasGFX950Insts())
3211 switch (HazardDefLatency) {
3213 NeedWaitStates = SMFMA4x4WriteVgprVALUMemExpReadWaitStates;
3216 NeedWaitStates = SMFMA16x16WriteVgprVALUMemExpReadWaitStates;
3219 NeedWaitStates = SMFMA32x32WriteVgprVALUMemExpReadWaitStates;
3226 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
3227 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
3229 if (WaitStatesNeeded == MaxWaitStates)
// DGEMM followed by a 64-bit FMA/FMAC: needs 2 wait states.
3234 unsigned Opc =
MI->getOpcode();
3235 const int DMFMAToFMA64WaitStates = 2;
3236 if ((
Opc == AMDGPU::V_FMA_F64_e64 ||
3237 Opc == AMDGPU::V_FMAC_F64_e32 ||
Opc == AMDGPU::V_FMAC_F64_e64 ||
3238 Opc == AMDGPU::V_FMAC_F64_dpp) &&
3239 WaitStatesNeeded < DMFMAToFMA64WaitStates) {
3240 int WaitStatesNeededForUse = DMFMAToFMA64WaitStates -
3241 getWaitStatesSince(IsDGEMMFn, DMFMAToFMA64WaitStates);
3242 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
// Write-side checks below apply only to VALU / memory / export instructions.
3245 if (!IsVALU && !IsMemOrExport)
3246 return WaitStatesNeeded;
// Write side.
3248 for (
const MachineOperand &Def :
MI->defs()) {
3249 const int SMFMA4x4WriteVgprVALUWawWaitStates = 5;
3250 const int SMFMA16x16WriteVgprVALUWawWaitStates = 11;
3251 const int SMFMA32x32WriteVgprVALUWawWaitStates = 19;
3252 const int SMFMA4x4ReadVgprVALUWarWaitStates = 1;
3253 const int GFX940_XDL4PassReadVgprVALUWarWaitStates = 3;
3254 const int SMFMA16x16ReadVgprVALUWarWaitStates = 7;
3255 const int SMFMA32x32ReadVgprVALUWarWaitStates = 15;
3256 const int DMFMA4x4WriteVgprVALUWriteWaitStates = 6;
3257 const int DMFMA16x16WriteVgprVALUWriteWaitStates = 11;
3258 const int DotWriteDifferentVALUWrite = 3;
3259 const int MaxWaitStates = 19;
3260 const int MaxWarWaitStates = 15;
// Earlier DOT with a different opcode writing the same register.
3265 int WaitStatesSinceDef = getWaitStatesSinceDef(
Reg, IsDotWriteFn,
3267 if (DOT &&
DOT->getOpcode() !=
MI->getOpcode())
3268 WaitStatesNeeded = std::max(WaitStatesNeeded, DotWriteDifferentVALUWrite -
3269 WaitStatesSinceDef);
// Earlier MFMA writing the same register.
3272 WaitStatesSinceDef =
3273 getWaitStatesSinceDef(
Reg, IsMFMAWriteFn, MaxWaitStates);
3275 int NeedWaitStates = MaxWaitStates;
3276 int NumPasses = TSchedModel.computeInstrLatency(
MFMA);
3279 switch (NumPasses) {
3281 NeedWaitStates = DMFMA4x4WriteVgprVALUWriteWaitStates;
3285 NeedWaitStates = DMFMA16x16WriteVgprVALUWriteWaitStates;
3290 }
else if (ST.hasGFX940Insts()) {
3294 NumPasses, ST.hasGFX950Insts())
3297 switch (NumPasses) {
3299 NeedWaitStates = SMFMA4x4WriteVgprVALUWawWaitStates;
3302 NeedWaitStates = SMFMA16x16WriteVgprVALUWawWaitStates;
3305 NeedWaitStates = SMFMA32x32WriteVgprVALUWawWaitStates;
3312 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
3313 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
3315 if (WaitStatesNeeded == MaxWaitStates)
// Write-after-read: earlier SMFMA that reads Reg (its src2 operand is
// inspected).
3319 auto IsSMFMAReadAsCFn = [&
Reg, &
MFMA,
this](
const MachineInstr &
MI) {
3321 !
MI.readsRegister(
Reg, &TRI))
3324 if (ST.hasGFX940Insts() && !TII.isXDL(
MI))
3327 const MachineOperand *SrcC =
3328 TII.getNamedOperand(
MI, AMDGPU::OpName::src2);
3338 int WaitStatesSinceUse = getWaitStatesSince(IsSMFMAReadAsCFn,
3343 unsigned HazardDefLatency = TSchedModel.computeInstrLatency(
MFMA);
3344 int NeedWaitStates = MaxWaitStates;
3345 switch (HazardDefLatency) {
3346 case 2: NeedWaitStates = SMFMA4x4ReadVgprVALUWarWaitStates;
// Latency 4 is asserted to occur only with GFX940 instructions.
3348 case 4:
assert(ST.hasGFX940Insts());
3349 NeedWaitStates = GFX940_XDL4PassReadVgprVALUWarWaitStates;
3351 case 8: NeedWaitStates = SMFMA16x16ReadVgprVALUWarWaitStates;
3353 case 16: [[fallthrough]];
3354 default: NeedWaitStates = SMFMA32x32ReadVgprVALUWarWaitStates;
3358 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceUse;
3359 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
3362 return WaitStatesNeeded;
3375 return MAI !=
nullptr;
3379 if (IsMFMAFn(*
MI)) {
3380 int W = getWaitStatesSince(IsMFMAFn, 16);
3382 return W < (int)TSchedModel.computeInstrLatency(MAI);
3396 while (
I->isBundledWithPred())
3402 if (
I->getOpcode() != AMDGPU::S_GETPC_B64)
3406 const unsigned NewBytes = 4;
3408 "Unexpected instruction insertion in bundle");
3411 while (NextMI != End && NextMI->isBundledWithPred()) {
3412 for (
auto &Operand : NextMI->operands()) {
3413 if (Operand.isGlobal())
3414 Operand.setOffset(Operand.getOffset() + NewBytes);
// Handles the VALU mask-write hazard (ST.hasVALUMaskWriteHazard(); the
// subtarget must not have extended wait counts). Applies to SALU and VALU
// instructions only.
// Visible outline:
//   * determine the hazardous SGPR(s) written by MI, ignoring EXEC_LO/HI and
//     the null SGPRs; a 64-bit register is tracked as its sub0/sub1 halves;
//   * run a stateful backward search (state = set of hazard SGPRs) where
//     IsHazardFn recognises VALU instructions that consume an SGPR mask
//     (implicit-VCC e32/dpp forms and explicit src2 e64 forms) and
//     UpdateStateFn processes S_WAITCNT_DEPCTR and removes SGPRs from the
//     state;
//   * fold any collected existing S_WAITCNT_DEPCTR instructions into one mask
//     by taking the minimum of the SaSdst, VaSdst and VaVcc fields, erase
//     them, and build a new S_WAITCNT_DEPCTR right after MI.
// NOTE(review): many statements are not visible in this listing (lambda
// headers, returns, the search call, DepCtr initialisation); confirm against
// the full source.
3420bool GCNHazardRecognizer::fixVALUMaskWriteHazard(
MachineInstr *
MI) {
3421 if (!ST.hasVALUMaskWriteHazard())
3423 assert(!ST.hasExtendedWaitCounts());
3430 if (!IsSALU && !IsVALU)
3442 const SIRegisterInfo *TRI = ST.getRegisterInfo();
3443 const MachineRegisterInfo &MRI = MF.getRegInfo();
// Registers singled out by the IgnoreableSGPR helper used below.
3448 case AMDGPU::EXEC_LO:
3449 case AMDGPU::EXEC_HI:
3451 case AMDGPU::SGPR_NULL:
3452 case AMDGPU::SGPR_NULL64:
// VCC in any of its forms.
3460 return Reg == AMDGPU::VCC ||
Reg == AMDGPU::VCC_LO ||
Reg == AMDGPU::VCC_HI;
// Search state: the SGPRs still considered hazardous. States compare equal
// when their sets match.
3464 SmallSet<Register, 2> HazardSGPRs;
3466 static unsigned getHashValue(
const StateType &State) {
3469 static bool isEqual(
const StateType &
LHS,
const StateType &
RHS) {
3470 return LHS.HazardSGPRs ==
RHS.HazardSGPRs;
3474 SmallVector<const MachineInstr *> WaitInstrs;
3475 bool HasSGPRRead =
false;
3476 StateType InitialState;
// Scan MI's operands for the SGPR definition that starts the hazard.
3479 MachineOperand *HazardDef =
nullptr;
3480 for (MachineOperand &
Op :
MI->operands()) {
3483 if (
Op.isDef() && HazardDef)
3487 if (IgnoreableSGPR(
Reg))
3490 if (
Op.isImplicit())
3492 if (!TRI->isSGPRReg(MRI,
Reg))
// Seed the state: a 32-bit SGPR as-is, otherwise both 32-bit halves.
3510 if (AMDGPU::SReg_32RegClass.
contains(HazardReg)) {
3511 InitialState.HazardSGPRs.insert(HazardReg);
3514 InitialState.HazardSGPRs.insert(TRI->getSubReg(HazardReg, AMDGPU::sub0));
3515 InitialState.HazardSGPRs.insert(TRI->getSubReg(HazardReg, AMDGPU::sub1));
// Hazard test for a visited instruction I.
3518 auto IsHazardFn = [&](StateType &State,
const MachineInstr &
I) {
3519 if (State.HazardSGPRs.empty())
3522 switch (
I.getOpcode()) {
// e32 / dpp encodings.
3523 case AMDGPU::V_ADDC_U32_e32:
3524 case AMDGPU::V_ADDC_U32_dpp:
3525 case AMDGPU::V_CNDMASK_B16_t16_e32:
3526 case AMDGPU::V_CNDMASK_B16_fake16_e32:
3527 case AMDGPU::V_CNDMASK_B16_t16_dpp:
3528 case AMDGPU::V_CNDMASK_B16_fake16_dpp:
3529 case AMDGPU::V_CNDMASK_B32_e32:
3530 case AMDGPU::V_CNDMASK_B32_dpp:
3531 case AMDGPU::V_DIV_FMAS_F32_e64:
3532 case AMDGPU::V_DIV_FMAS_F64_e64:
3533 case AMDGPU::V_SUBB_U32_e32:
3534 case AMDGPU::V_SUBB_U32_dpp:
3535 case AMDGPU::V_SUBBREV_U32_e32:
3536 case AMDGPU::V_SUBBREV_U32_dpp: {
// e64 encodings: the src2 operand is compared against the hazard register.
3540 case AMDGPU::V_ADDC_U32_e64:
3541 case AMDGPU::V_ADDC_U32_e64_dpp:
3542 case AMDGPU::V_CNDMASK_B16_t16_e64:
3543 case AMDGPU::V_CNDMASK_B16_fake16_e64:
3544 case AMDGPU::V_CNDMASK_B16_t16_e64_dpp:
3545 case AMDGPU::V_CNDMASK_B16_fake16_e64_dpp:
3546 case AMDGPU::V_CNDMASK_B32_e64:
3547 case AMDGPU::V_CNDMASK_B32_e64_dpp:
3548 case AMDGPU::V_SUBB_U32_e64:
3549 case AMDGPU::V_SUBB_U32_e64_dpp:
3550 case AMDGPU::V_SUBBREV_U32_e64:
3551 case AMDGPU::V_SUBBREV_U32_e64_dpp: {
3553 const MachineOperand *SSRCOp = TII.getNamedOperand(
I, AMDGPU::OpName::src2);
3555 bool Result = TRI->regsOverlap(SSRCOp->
getReg(), HazardReg);
// State update for a visited instruction I.
3567 auto UpdateStateFn = [&](StateType &State,
const MachineInstr &
I) {
3568 switch (
I.getOpcode()) {
3569 case AMDGPU::S_WAITCNT_DEPCTR:
// Condition on an existing wait: no SGPR read seen so far, same block as MI,
// not bundled, and all ConstantMaskBits set in its immediate.
3571 if (!HasSGPRRead &&
I.getParent() ==
MI->getParent() && !
I.isBundled() &&
3572 (
I.getOperand(0).getImm() & ConstantMaskBits) == ConstantMaskBits)
3577 for (
auto &
Op :
I.operands()) {
3582 if (IgnoreableSGPR(
Reg))
3585 if (
Op.isImplicit())
3587 if (!TRI->isSGPRReg(MRI,
Reg))
// Look for a tracked SGPR equal to or overlapping Reg; it is erased from the
// state.
3598 for (
Register SGPR : State.HazardSGPRs) {
3599 if (
Reg == SGPR || TRI->regsOverlap(
Reg, SGPR))
3603 State.HazardSGPRs.erase(SGPR);
// The search starts at the instruction preceding MI.
3612 std::next(
MI->getReverseIterator())))
// Merge previously inserted waits into the new one.
3622 if (!WaitInstrs.
empty()) {
3626 SmallVector<MachineInstr *> ToErase;
// Walk backwards from MI until all recorded waits have been found.
3628 for (MachineBasicBlock::reverse_iterator It = MI->getReverseIterator(),
3629 End = MI->getParent()->rend();
3630 Found < WaitInstrs.size() && It != End; ++It) {
3631 MachineInstr *WaitMI = &*It;
3633 if (std::as_const(WaitMI) != WaitInstrs[Found])
3636 unsigned WaitMask = WaitMI->getOperand(0).getImm();
3637 assert((WaitMask & ConstantMaskBits) == ConstantMaskBits);
// Each field of the combined mask becomes the minimum of the two values.
3638 DepCtr = AMDGPU::DepCtr::encodeFieldSaSdst(
3639 DepCtr, std::min(AMDGPU::DepCtr::decodeFieldSaSdst(WaitMask),
3640 AMDGPU::DepCtr::decodeFieldSaSdst(DepCtr)));
3641 DepCtr = AMDGPU::DepCtr::encodeFieldVaSdst(
3642 DepCtr, std::min(AMDGPU::DepCtr::decodeFieldVaSdst(WaitMask),
3643 AMDGPU::DepCtr::decodeFieldVaSdst(DepCtr)));
3644 DepCtr = AMDGPU::DepCtr::encodeFieldVaVcc(
3645 DepCtr, std::min(AMDGPU::DepCtr::decodeFieldVaVcc(WaitMask),
3646 AMDGPU::DepCtr::decodeFieldVaVcc(DepCtr)));
3647 ToErase.push_back(WaitMI);
// Erase after the walk so the reverse iterator above stays valid.
3650 for (MachineInstr *WaitMI : ToErase)
3651 WaitMI->eraseFromParent();
// The new wait goes immediately after MI.
3655 auto NextMI = std::next(
MI->getIterator());
3656 auto NewMI =
BuildMI(*
MI->getParent(), NextMI,
MI->getDebugLoc(),
3657 TII.get(AMDGPU::S_WAITCNT_DEPCTR))
3669 if (EntryMBB.
begin() != EntryMBB.
end()) {
3670 auto &EntryMI = *EntryMBB.
begin();
3671 if (EntryMI.getOpcode() == AMDGPU::S_SETPRIO &&
3672 EntryMI.getOperand(0).getImm() >= Priority)
3681bool GCNHazardRecognizer::fixRequiredExportPriority(
MachineInstr *
MI) {
3682 if (!ST.hasRequiredExportPriority())
3687 MachineBasicBlock *
MBB =
MI->getParent();
3700 const int MaxPriority = 3;
3701 const int NormalPriority = 2;
3702 const int PostExportPriority = 0;
3704 auto It =
MI->getIterator();
3705 switch (
MI->getOpcode()) {
3706 case AMDGPU::S_ENDPGM:
3707 case AMDGPU::S_ENDPGM_SAVED:
3708 case AMDGPU::S_ENDPGM_ORDERED_PS_DONE:
3709 case AMDGPU::SI_RETURN_TO_EPILOG:
3712 if (MF->getFrameInfo().hasCalls())
3715 case AMDGPU::S_SETPRIO: {
3717 auto &PrioOp =
MI->getOperand(0);
3718 int Prio = PrioOp.getImm();
3719 bool InWA = (Prio == PostExportPriority) &&
3720 (It !=
MBB->
begin() && TII.isEXP(*std::prev(It)));
3721 if (InWA || Prio >= NormalPriority)
3723 PrioOp.setImm(std::min(Prio + NormalPriority, MaxPriority));
3727 if (!TII.isEXP(*
MI))
3738 auto NextMI = std::next(It);
3739 bool EndOfShader =
false;
3740 if (NextMI !=
MBB->
end()) {
3742 if (TII.isEXP(*NextMI))
3745 if (NextMI->getOpcode() == AMDGPU::S_SETPRIO &&
3746 NextMI->getOperand(0).getImm() == PostExportPriority)
3748 EndOfShader = NextMI->getOpcode() == AMDGPU::S_ENDPGM;
3755 .
addImm(PostExportPriority);
3759 BuildMI(*
MBB, NextMI,
DL, TII.get(AMDGPU::S_WAITCNT_EXPCNT))
3760 .
addReg(AMDGPU::SGPR_NULL)
3780 const SIInstrInfo *TII = ST.getInstrInfo();
3792 TII->get(AMDGPU::S_WAITCNT_DEPCTR))
3797bool GCNHazardRecognizer::fixDsAtomicAsyncBarrierArriveB64(
MachineInstr *
MI) {
3798 if (
MI->getOpcode() != AMDGPU::DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64)
3801 const SIInstrInfo *TII = ST.getInstrInfo();
3803 TII->get(AMDGPU::S_WAITCNT_DEPCTR))
3805 BuildMI(*
MI->getParent(), std::next(
MI->getIterator()),
MI->getDebugLoc(),
3806 TII->get(AMDGPU::S_WAITCNT_DEPCTR))
3812bool GCNHazardRecognizer::fixScratchBaseForwardingHazard(
MachineInstr *
MI) {
3815 if (!IsHazardRecognizerMode)
3818 const SIRegisterInfo *TRI = ST.getRegisterInfo();
3819 const SIInstrInfo *TII = ST.getInstrInfo();
3821 const int FlatScrBaseWaitStates = 10;
3823 bool ReadsFlatScrLo =
3824 MI->readsRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO, TRI);
3825 bool ReadsFlatScrHi =
3826 MI->readsRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, TRI);
3832 ReadsFlatScrLo =
true;
3835 ReadsFlatScrHi =
true;
3840 const MachineRegisterInfo &MRI = MF.getRegInfo();
3843 DenseSet<const MachineBasicBlock *> Visited;
3845 return MI.modifiesRegister(
Reg, TRI);
3850 auto IsSGPRDef = [TII, TRI, &MRI](
const MachineInstr &
MI) ->
unsigned {
3851 if (!TII->isSALU(
MI) && !TII->isVALU(
MI))
3853 for (
const MachineOperand &MO :
MI.all_defs()) {
3854 if (TRI->isSGPRReg(MRI, MO.getReg()))
3860 auto IsExpiredFn = [=](
const MachineInstr &
MI,
int SgprWrites) {
3861 if (
MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR) {
3862 unsigned Wait =
MI.getOperand(0).getImm();
3867 return SgprWrites >= FlatScrBaseWaitStates;
3870 return ::getWaitStatesSince(
3871 IsHazardFn,
MI->getParent(), std::next(
MI->getReverseIterator()),
3872 0,
IsExpiredFn, Visited, IsSGPRDef) < FlatScrBaseWaitStates;
3876 !IsRegDefHazard(AMDGPU::SGPR102)) &&
3878 !IsRegDefHazard(AMDGPU::SGPR103)))
3882 TII->get(AMDGPU::S_WAITCNT_DEPCTR))
3893 BuildMI(*
MI->getParent(),
MI,
MI->getDebugLoc(), TII.get(AMDGPU::V_NOP_e32));
3894 BuildMI(*
MI->getParent(),
MI,
MI->getDebugLoc(), TII.get(AMDGPU::V_NOP_e32));
for(const MachineOperand &MO :llvm::drop_begin(OldMI.operands(), Desc.getNumOperands()))
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Provides AMDGPU specific target descriptions.
AMDGPU Rewrite AGPR Copy MFMA
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static bool isEqual(const Function &Caller, const Function &Callee)
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static cl::opt< unsigned, false, MFMAPaddingRatioParser > MFMAPaddingRatio("amdgpu-mfma-padding-ratio", cl::init(0), cl::Hidden, cl::desc("Fill a percentage of the latency between " "neighboring MFMA with s_nops."))
static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF, const GCNSubtarget &ST)
static cl::opt< bool > EnableWMMAVnopHoisting("amdgpu-wmma-vnop-hoisting", cl::init(true), cl::Hidden, cl::desc("Hoist WMMA hazard V_NOPs from loops to preheaders"))
static bool consumesDstSelForwardingOperand(const MachineInstr *VALU, const MachineOperand *Dst, const SIRegisterInfo *TRI)
Checks whether the provided MI "consumes" the operand with a Dest sel forwarding issue Dst.
static bool isSGetReg(unsigned Opcode)
static bool breaksSMEMSoftClause(MachineInstr *MI)
static bool isLdsDma(const MachineInstr &MI)
static int GFX940_XDL_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses, bool IsGFX950)
static bool isRFE(unsigned Opcode)
static bool isRWLane(unsigned Opcode)
static bool isSMovRel(unsigned Opcode)
static const MachineOperand * getDstSelForwardingOperand(const MachineInstr &MI, const GCNSubtarget &ST)
Dest sel forwarding issue occurs if additional logic is needed to swizzle / pack the computed value i...
static int GFX940_XDL_N_PassWritesVGPROverlappedSGEMMDGEMMSrcCWaitStates(int NumPasses, bool IsGFX950)
static void updateGetPCBundle(MachineInstr *NewMI)
static int GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses, bool IsGFX950)
static bool isStoreCountWaitZero(const MachineInstr &I)
static bool breaksVMEMSoftClause(MachineInstr *MI)
static bool isVCmpXWritesExec(const SIInstrInfo &TII, const SIRegisterInfo &TRI, const MachineInstr &MI)
static bool isSSetReg(unsigned Opcode)
static unsigned getWMMAHazardInstInCategory(const MachineInstr &MI, const SIInstrInfo *TII, const TargetSchedModel &SchedModel)
static void addRegUnits(const SIRegisterInfo &TRI, BitVector &BV, MCRegister Reg)
static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr)
static bool isDivFMas(unsigned Opcode)
static bool hasHazard(StateT InitialState, function_ref< HazardFnResult(StateT &, const MachineInstr &)> IsHazard, function_ref< void(StateT &, const MachineInstr &)> UpdateState, const MachineBasicBlock *InitialMBB, MachineBasicBlock::const_reverse_instr_iterator InitialI)
static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard, const MachineBasicBlock *MBB, MachineBasicBlock::const_reverse_instr_iterator I, int WaitStates, GCNHazardRecognizer::IsExpiredFn IsExpired, DenseSet< const MachineBasicBlock * > &Visited, GCNHazardRecognizer::GetNumWaitStatesFn GetNumWaitStates=SIInstrInfo::getNumWaitStates)
static int GFX940_SMFMA_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses)
static int GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(int NumPasses, bool IsGFX950)
static int GFX940_SMFMA_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses)
static int GFX940_SMFMA_N_PassWritesVGPROverlappedSMFMASrcCWaitStates(int NumPasses)
static bool isCoexecutableVALUInst(const MachineInstr &MI)
static bool ensureEntrySetPrio(MachineFunction *MF, int Priority, const SIInstrInfo &TII)
static void addRegsToSet(const SIRegisterInfo &TRI, iterator_range< MachineInstr::const_mop_iterator > Ops, BitVector &DefSet, BitVector &UseSet)
static void insertNoopsInBundle(MachineInstr *MI, const SIInstrInfo &TII, unsigned Quantity)
static bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII, const MachineInstr &MI)
static cl::opt< unsigned > NopPadding("amdgpu-snop-padding", cl::init(0), cl::Hidden, cl::desc("Insert a s_nop x before every instruction"))
static bool isPermlane(const MachineInstr &MI)
static int GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates(int NumPasses)
static int GFX940_XDL_N_PassWritesVGPROverlappedXDLOrSMFMASrcCWaitStates(int NumPasses, bool IsGFX950)
AMD GCN specific subclass of TargetSubtarget.
static Register UseReg(const MachineOperand &MO)
const HexagonInstrInfo * TII
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static llvm::Error parse(GsymDataExtractor &Data, uint64_t BaseAddr, LineEntryCallback const &Callback)
static DebugLoc getDebugLoc(MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
Return the first DebugLoc that has line number information, given a range of instructions.
Register const TargetRegisterInfo * TRI
Promote Memory to Register
static MCRegister getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
static const uint32_t IV[8]
unsigned get(InstCounterType T) const
BitVector & set()
Set all bits in the bitvector.
std::pair< iterator, bool > insert_as(std::pair< KeyT, ValueT > &&KV, const LookupKeyT &Val)
Alternate version of insert() which allows a different, and possibly less expensive,...
Implements a dense probed hash-table based set.
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These methods get and set the calling convention of this functio...
unsigned getHazardWaitStates(MachineInstr *MI) const
Returns the number of wait states until all hazards for MI are resolved.
unsigned PreEmitNoopsCommon(MachineInstr *) const
void EmitNoop() override
EmitNoop - This callback is invoked when a noop was added to the instruction stream.
void Reset() override
Reset - This callback is invoked when a new block of instructions is about to be scheduled.
unsigned PreEmitNoops(MachineInstr *) override
This overload will be used when the hazard recognizer is being used by a non-scheduling pass,...
void EmitInstruction(SUnit *SU) override
EmitInstruction - This callback is invoked when an instruction is emitted, to advance the hazard stat...
function_ref< bool(const MachineInstr &)> IsHazardFn
void AdvanceCycle() override
AdvanceCycle - This callback is invoked whenever the next top-down instruction to be scheduled cannot...
function_ref< unsigned int(const MachineInstr &)> GetNumWaitStatesFn
bool ShouldPreferAnother(SUnit *SU) const override
ShouldPreferAnother - This callback may be invoked if getHazardType returns NoHazard.
function_ref< bool(const MachineInstr &, int WaitStates)> IsExpiredFn
GCNHazardRecognizer(const MachineFunction &MF, MachineLoopInfo *MLI=nullptr)
HazardType getHazardType(SUnit *SU, int Stalls) override
getHazardType - Return the hazard type of emitting this node.
void RecedeCycle() override
RecedeCycle - This callback is invoked whenever the next bottom-up instruction to be scheduled cannot...
BlockT * getLoopPreheader() const
If there is a preheader for this loop, return it.
LoopT * getParentLoop() const
Return the parent loop if it exists or nullptr for top level loops.
Wrapper class representing physical registers. Should be passed by value.
Instructions::const_reverse_iterator const_reverse_instr_iterator
LLVM_ABI iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
Instructions::iterator instr_iterator
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
MachineInstrBundleIterator< MachineInstr > iterator
Function & getFunction()
Return the LLVM function that this machine code represents.
const MachineBasicBlock & front() const
const MachineInstrBuilder & addReg(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addDef(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a virtual register definition operand.
Representation of each machine instruction.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
const MachineBasicBlock * getParent() const
bool mayStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly modify memory.
bool isBundled() const
Return true if this instruction is part of a bundle.
MachineOperand class - Representation of each machine instruction operand.
void setImm(int64_t immVal)
bool isReg() const
isReg - Tests if this is a MO_Register operand.
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
void setIsKill(bool Val=true)
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI bool isConstantPhysReg(MCRegister PhysReg) const
Returns true if PhysReg is unallocatable and constant throughout the function.
LLVM_ABI bool isPhysRegUsed(MCRegister PhysReg, bool SkipRegMaskTest=false) const
Return true if the specified register is modified or read in this function.
static bool isDS(const MachineInstr &MI)
static bool isVMEM(const MachineInstr &MI)
static bool isSMRD(const MachineInstr &MI)
static bool isMTBUF(const MachineInstr &MI)
static bool isDGEMM(unsigned Opcode)
static bool isEXP(const MachineInstr &MI)
static bool isSALU(const MachineInstr &MI)
static bool isSDWA(const MachineInstr &MI)
static bool isDOT(const MachineInstr &MI)
static bool isSWMMAC(const MachineInstr &MI)
static bool isLDSDIR(const MachineInstr &MI)
static bool isTRANS(const MachineInstr &MI)
static bool isMUBUF(const MachineInstr &MI)
static bool isWaitcnt(unsigned Opcode)
static bool isDPP(const MachineInstr &MI)
static bool isMFMA(const MachineInstr &MI)
static bool isMAI(const MCInstrDesc &Desc)
static bool isFPAtomic(const MachineInstr &MI)
static bool isMIMG(const MachineInstr &MI)
static unsigned getNumWaitStates(const MachineInstr &MI)
Return the number of wait states that result from executing this instruction.
static bool isWMMA(const MachineInstr &MI)
static bool isFLAT(const MachineInstr &MI)
static bool isVALU(const MachineInstr &MI)
static bool isLDSDMA(const MachineInstr &MI)
unsigned getOccupancy() const
Scheduling unit. This is a node in the scheduling DAG.
bool isInstr() const
Returns true if this SUnit refers to a machine instruction as opposed to an SDNode.
MachineInstr * getInstr() const
Returns the representative MachineInstr for this SUnit.
unsigned getMaxLookAhead() const
unsigned MaxLookAhead
MaxLookAhead - Indicate the number of cycles in the scoreboard state.
virtual void EmitNoops(unsigned Quantity)
EmitNoops - This callback is invoked when noops were added to the instruction stream.
size_type size() const
Determine the number of elements in the SetVector.
bool insert(const value_type &X)
Insert a new element into the SetVector.
A SetVector that performs no allocations if smaller than a certain size.
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
reference emplace_back(ArgTypes &&... Args)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
bool getAsInteger(unsigned Radix, T &Result) const
Parse the current string as an integer of the specified radix.
Provide an instruction scheduling machine model to CodeGen passes.
std::pair< iterator, bool > insert(const ValueT &V)
An efficient, type-erasing, non-owning reference to a callable.
self_iterator getIterator()
A range adaptor for a pair of iterators.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
unsigned encodeFieldVaVcc(unsigned Encoded, unsigned VaVcc)
unsigned encodeFieldVaVdst(unsigned Encoded, unsigned VaVdst)
unsigned decodeFieldSaSdst(unsigned Encoded)
unsigned decodeFieldVaSdst(unsigned Encoded)
unsigned encodeFieldVmVsrc(unsigned Encoded, unsigned VmVsrc)
unsigned encodeFieldSaSdst(unsigned Encoded, unsigned SaSdst)
unsigned decodeFieldVaVdst(unsigned Encoded)
unsigned decodeFieldVmVsrc(unsigned Encoded)
unsigned encodeFieldVaSdst(unsigned Encoded, unsigned VaSdst)
LLVM_READONLY const MIMGInfo * getMIMGInfo(unsigned Opc)
FPType getFPDstSelType(unsigned Opc)
bool isGFX12Plus(const MCSubtargetInfo &STI)
LLVM_ABI IsaVersion getIsaVersion(StringRef GPU)
Waitcnt decodeWaitcnt(const IsaVersion &Version, unsigned Encoded)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx)
unsigned getRegBitWidth(const TargetRegisterClass &RC)
Get the size in bits of a register from the register class RC.
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
This namespace contains all of the command line option processing machinery.
initializer< Ty > init(const Ty &Val)
NodeAddr< DefNode * > Def
NodeAddr< UseNode * > Use
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
FunctionAddr VTableAddr Value
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
@ Define
Register definition.
constexpr RegState getDeadRegState(bool B)
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
FunctionAddr VTableAddr Count
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
LLVM_ATTRIBUTE_VISIBILITY_DEFAULT AnalysisKey InnerAnalysisManagerProxy< AnalysisManagerT, IRUnitT, ExtraArgTs... >::Key
DWARFExpression::Operation Op
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
LLVM_ABI Printable printMBBReference(const MachineBasicBlock &MBB)
Prints a machine basic block reference.
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
static std::tuple< typename Fields::ValueType... > decode(uint64_t Encoded)
An information struct used to provide DenseMap with the various necessary components for a given valu...