27#define DEBUG_TYPE "gcn-hazard-recognizer"
30 "Number of WMMA hazard V_NOPs hoisted from loops");
32 "Number of WMMA hazards where V_NOP hoisting was not possible");
36struct MFMAPaddingRatioParser :
public cl::parser<unsigned> {
39 bool parse(cl::Option &O, StringRef ArgName, StringRef Arg,
unsigned &
Value) {
41 return O.error(
"'" + Arg +
"' value invalid for uint argument!");
44 return O.error(
"'" + Arg +
"' value must be in the range [0, 100]!");
54 cl::desc(
"Fill a percentage of the latency between "
55 "neighboring MFMA with s_nops."));
60 cl::desc(
"Insert a s_nop x before every instruction"));
64 cl::desc(
"Hoist WMMA hazard V_NOPs from loops to preheaders"));
75 : IsHazardRecognizerMode(
false), CurrCycleInstr(nullptr), MF(MF),
76 ST(MF.getSubtarget<
GCNSubtarget>()), TII(*ST.getInstrInfo()),
77 TRI(TII.getRegisterInfo()), TSchedModel(TII.getSchedModel()), MLI(MLI),
78 ClauseUses(TRI.getNumRegUnits()), ClauseDefs(TRI.getNumRegUnits()) {
79 MaxLookAhead = MF.getRegInfo().isPhysRegUsed(AMDGPU::AGPR0) ? 19 : 5;
84 EmittedInstrs.clear();
96 return Opcode == AMDGPU::V_DIV_FMAS_F32_e64 || Opcode == AMDGPU::V_DIV_FMAS_F64_e64;
100 return Opcode == AMDGPU::S_GETREG_B32 || Opcode == AMDGPU::S_GETREG_B32_const;
105 case AMDGPU::S_SETREG_B32:
106 case AMDGPU::S_SETREG_B32_mode:
107 case AMDGPU::S_SETREG_IMM32_B32:
108 case AMDGPU::S_SETREG_IMM32_B32_mode:
115 return Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32;
119 return Opcode == AMDGPU::S_RFE_B64;
124 case AMDGPU::S_MOVRELS_B32:
125 case AMDGPU::S_MOVRELS_B64:
126 case AMDGPU::S_MOVRELD_B32:
127 case AMDGPU::S_MOVRELD_B64:
136 if (
TII.isAlwaysGDS(
MI.getOpcode()))
139 switch (
MI.getOpcode()) {
140 case AMDGPU::S_SENDMSG:
141 case AMDGPU::S_SENDMSGHALT:
142 case AMDGPU::S_TTRACEDATA:
146 case AMDGPU::DS_PERMUTE_B32:
147 case AMDGPU::DS_BPERMUTE_B32:
150 if (
TII.isDS(
MI.getOpcode())) {
151 int GDS = AMDGPU::getNamedOperandIdx(
MI.getOpcode(),
152 AMDGPU::OpName::gds);
153 if (
MI.getOperand(GDS).getImm())
161 unsigned Opcode =
MI.getOpcode();
162 return Opcode == AMDGPU::V_PERMLANE16_B32_e64 ||
163 Opcode == AMDGPU::V_PERMLANE64_B32 ||
164 Opcode == AMDGPU::V_PERMLANEX16_B32_e64 ||
165 Opcode == AMDGPU::V_PERMLANE16_VAR_B32_e64 ||
166 Opcode == AMDGPU::V_PERMLANEX16_VAR_B32_e64 ||
167 Opcode == AMDGPU::V_PERMLANE16_SWAP_B32_e32 ||
168 Opcode == AMDGPU::V_PERMLANE16_SWAP_B32_e64 ||
169 Opcode == AMDGPU::V_PERMLANE32_SWAP_B32_e32 ||
170 Opcode == AMDGPU::V_PERMLANE32_SWAP_B32_e64 ||
171 Opcode == AMDGPU::V_PERMLANE_BCAST_B32_e64 ||
172 Opcode == AMDGPU::V_PERMLANE_UP_B32_e64 ||
173 Opcode == AMDGPU::V_PERMLANE_DOWN_B32_e64 ||
174 Opcode == AMDGPU::V_PERMLANE_XOR_B32_e64 ||
175 Opcode == AMDGPU::V_PERMLANE_IDX_GEN_B32_e64;
185 AMDGPU::OpName::simm16);
202 if (ST.hasNSAtoVMEMBug() && checkNSAtoVMEMHazard(
MI) > 0)
205 if (checkFPAtomicToDenormModeHazard(
MI) > 0)
209 if (!IsHazardRecognizerMode) {
210 if (checkWMMACoexecutionHazards(
MI) > 0)
214 if (ST.hasNoDataDepHazard())
226 if (
isDivFMas(
MI->getOpcode()) && checkDivFMasHazards(
MI) > 0)
229 if (
isRWLane(
MI->getOpcode()) && checkRWLaneHazards(
MI) > 0)
234 checkMAIVALUHazards(
MI) > 0)
237 if (
isSGetReg(
MI->getOpcode()) && checkGetRegHazards(
MI) > 0)
240 if (
isSSetReg(
MI->getOpcode()) && checkSetRegHazards(
MI) > 0)
243 if (
isRFE(
MI->getOpcode()) && checkRFEHazards(
MI) > 0)
246 if (((ST.hasReadM0MovRelInterpHazard() &&
248 MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
249 MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) ||
251 (ST.hasReadM0LdsDmaHazard() &&
isLdsDma(*
MI)) ||
252 (ST.hasReadM0LdsDirectHazard() &&
253 MI->readsRegister(AMDGPU::LDS_DIRECT,
nullptr))) &&
254 checkReadM0Hazards(
MI) > 0)
261 checkMAILdStHazards(
MI) > 0)
264 if (
MI->isInlineAsm() && checkInlineAsmHazards(
MI) > 0)
272 while (Quantity > 0) {
273 unsigned Arg = std::min(Quantity, 8u);
281GCNHazardRecognizer::getMFMAPipelineWaitStates(
const MachineInstr &
MI)
const {
282 const MCSchedClassDesc *SC = TSchedModel.resolveSchedClass(&
MI);
283 assert(TSchedModel.getWriteProcResBegin(SC) !=
284 TSchedModel.getWriteProcResEnd(SC));
285 return TSchedModel.getWriteProcResBegin(SC)->ReleaseAtCycle;
288void GCNHazardRecognizer::processBundle() {
292 for (;
MI !=
E &&
MI->isInsideBundle(); ++
MI) {
293 CurrCycleInstr = &*
MI;
296 if (IsHazardRecognizerMode) {
297 fixHazards(CurrCycleInstr);
305 for (
unsigned i = 0, e = std::min(WaitStates,
MaxLookAhead - 1); i <
e; ++i)
306 EmittedInstrs.push_front(
nullptr);
308 EmittedInstrs.push_front(CurrCycleInstr);
311 CurrCycleInstr =
nullptr;
315 assert(IsHazardRecognizerMode);
319 if (
MI->isInsideBundle())
329 IsHazardRecognizerMode =
true;
333 CurrCycleInstr =
nullptr;
344 return std::max(WaitStates, checkSMRDHazards(
MI));
346 if (ST.hasNSAtoVMEMBug())
347 WaitStates = std::max(WaitStates, checkNSAtoVMEMHazard(
MI));
349 WaitStates = std::max(WaitStates, checkFPAtomicToDenormModeHazard(
MI));
351 if (ST.hasNoDataDepHazard())
355 WaitStates = std::max(WaitStates, checkVMEMHazards(
MI));
358 WaitStates = std::max(WaitStates, checkVALUHazards(
MI));
361 WaitStates = std::max(WaitStates, checkDPPHazards(
MI));
364 WaitStates = std::max(WaitStates, checkDivFMasHazards(
MI));
367 WaitStates = std::max(WaitStates, checkRWLaneHazards(
MI));
371 checkMAIVALUHazards(
MI) > 0)
372 WaitStates = std::max(WaitStates, checkMAIVALUHazards(
MI));
374 if (
MI->isInlineAsm())
375 return std::max(WaitStates, checkInlineAsmHazards(
MI));
378 return std::max(WaitStates, checkGetRegHazards(
MI));
381 return std::max(WaitStates, checkSetRegHazards(
MI));
384 return std::max(WaitStates, checkRFEHazards(
MI));
386 if ((ST.hasReadM0MovRelInterpHazard() &&
388 MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
389 MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) ||
391 (ST.hasReadM0LdsDmaHazard() &&
isLdsDma(*
MI)) ||
392 (ST.hasReadM0LdsDirectHazard() &&
393 MI->readsRegister(AMDGPU::LDS_DIRECT,
nullptr)))
394 return std::max(WaitStates, checkReadM0Hazards(
MI));
397 return std::max(WaitStates, checkMAIHazards(
MI));
400 return std::max(WaitStates, checkMAILdStHazards(
MI));
403 return std::max(WaitStates, checkPermlaneHazards(
MI));
409 EmittedInstrs.push_front(
nullptr);
415 if (!CurrCycleInstr) {
416 EmittedInstrs.push_front(
nullptr);
420 if (CurrCycleInstr->isBundle()) {
425 unsigned NumWaitStates = TII.getNumWaitStates(*CurrCycleInstr);
426 if (!NumWaitStates) {
427 CurrCycleInstr =
nullptr;
432 EmittedInstrs.push_front(CurrCycleInstr);
439 EmittedInstrs.push_front(
nullptr);
447 CurrCycleInstr =
nullptr;
451 assert(!IsHazardRecognizerMode &&
452 "Bottom-up scheduling shouldn't run in hazard recognizer mode");
462template <
typename StateT>
472 static bool isEqual(
const StateMapKey &
LHS,
const StateMapKey &
RHS) {
477 static inline StateMapKey getEmptyKey() {
482 static inline StateMapKey getTombstoneKey() {
487 static unsigned getHashValue(
const StateMapKey &
Key) {
488 return StateT::getHashValue((*
Key.States)[
Key.Idx]);
490 static unsigned getHashValue(
const StateT &State) {
491 return StateT::getHashValue(State);
493 static bool isEqual(
const StateMapKey &
LHS,
const StateMapKey &
RHS) {
494 const auto EKey = getEmptyKey();
495 const auto TKey = getTombstoneKey();
496 if (StateMapKey::isEqual(
LHS, EKey) || StateMapKey::isEqual(
RHS, EKey) ||
497 StateMapKey::isEqual(
LHS, TKey) || StateMapKey::isEqual(
RHS, TKey))
498 return StateMapKey::isEqual(
LHS,
RHS);
499 return StateT::isEqual((*
LHS.States)[
LHS.Idx], (*
RHS.States)[
RHS.Idx]);
501 static bool isEqual(
const StateT &
LHS,
const StateMapKey &
RHS) {
502 if (StateMapKey::isEqual(
RHS, getEmptyKey()) ||
503 StateMapKey::isEqual(
RHS, getTombstoneKey()))
505 return StateT::isEqual(
LHS, (*
RHS.States)[
RHS.Idx]);
514 StateT State = InitialState;
517 unsigned WorkIdx = 0;
519 bool Expired =
false;
520 for (
auto E =
MBB->instr_rend();
I !=
E; ++
I) {
525 auto Result = IsHazard(State, *
I);
533 if (
I->isInlineAsm() ||
I->isMetaInstruction())
536 UpdateState(State, *
I);
540 unsigned StateIdx = States.
size();
541 StateMapKey
Key = {&States, StateIdx};
542 auto Insertion = StateMap.
insert_as(std::pair(
Key, StateIdx), State);
543 if (Insertion.second) {
546 StateIdx = Insertion.first->second;
549 Worklist.
insert(std::pair(Pred, StateIdx));
552 if (WorkIdx == Worklist.
size())
556 std::tie(
MBB, StateIdx) = Worklist[WorkIdx++];
557 State = States[StateIdx];
558 I =
MBB->instr_rbegin();
575 for (
auto E =
MBB->instr_rend();
I !=
E; ++
I) {
583 if (
I->isInlineAsm())
586 WaitStates += GetNumWaitStates(*
I);
588 if (IsExpired(*
I, WaitStates))
589 return std::numeric_limits<int>::max();
592 int MinWaitStates = std::numeric_limits<int>::max();
594 if (!Visited.
insert(Pred).second)
598 IsExpired, Visited, GetNumWaitStates);
600 MinWaitStates = std::min(MinWaitStates, W);
603 return MinWaitStates;
614 std::next(
MI->getReverseIterator()), 0, IsExpired,
615 Visited, GetNumWaitStates);
618int GCNHazardRecognizer::getWaitStatesSince(
619 IsHazardFn IsHazard,
int Limit, GetNumWaitStatesFn GetNumWaitStates)
const {
620 if (IsHazardRecognizerMode) {
621 auto IsExpiredFn = [Limit](
const MachineInstr &,
int WaitStates) {
622 return WaitStates >= Limit;
624 return ::getWaitStatesSince(IsHazard, CurrCycleInstr,
IsExpiredFn,
629 for (MachineInstr *
MI : EmittedInstrs) {
634 if (
MI->isInlineAsm())
637 WaitStates +=
MI ? GetNumWaitStates(*
MI) : 1;
639 if (WaitStates >= Limit)
642 return std::numeric_limits<int>::max();
645int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard,
650int GCNHazardRecognizer::getWaitStatesSinceDef(
unsigned Reg,
651 IsHazardFn IsHazardDef,
653 const SIRegisterInfo *TRI = ST.getRegisterInfo();
656 return IsHazardDef(
MI) &&
MI.modifiesRegister(
Reg, TRI);
662int GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard,
677 for (MCRegUnit Unit :
TRI.regunits(
Reg))
678 BV.
set(
static_cast<unsigned>(Unit));
690void GCNHazardRecognizer::addClauseInst(
const MachineInstr &
MI)
const {
702int GCNHazardRecognizer::checkSoftClauseHazards(
MachineInstr *MEM)
const {
705 if (!ST.isXNACKEnabled())
708 bool IsSMRD = TII.isSMRD(*MEM);
722 for (MachineInstr *
MI : EmittedInstrs) {
734 if (ClauseDefs.none())
747 return ClauseDefs.anyCommon(ClauseUses) ? 1 : 0;
750int GCNHazardRecognizer::checkSMRDHazards(
MachineInstr *SMRD)
const {
751 int WaitStatesNeeded = 0;
753 WaitStatesNeeded = checkSoftClauseHazards(SMRD);
756 if (!ST.hasSMRDReadVALUDefHazard())
757 return WaitStatesNeeded;
761 int SmrdSgprWaitStates = 4;
762 auto IsHazardDefFn = [
this](
const MachineInstr &
MI) {
763 return TII.isVALU(
MI);
765 auto IsBufferHazardDefFn = [
this](
const MachineInstr &
MI) {
766 return TII.isSALU(
MI);
769 bool IsBufferSMRD = TII.isBufferSMRD(*SMRD);
771 for (
const MachineOperand &Use :
SMRD->uses()) {
774 int WaitStatesNeededForUse =
775 SmrdSgprWaitStates - getWaitStatesSinceDef(
Use.getReg(), IsHazardDefFn,
777 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
787 int WaitStatesNeededForUse =
788 SmrdSgprWaitStates - getWaitStatesSinceDef(
Use.getReg(),
791 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
795 return WaitStatesNeeded;
798int GCNHazardRecognizer::checkVMEMHazards(
MachineInstr *VMEM)
const {
799 if (!ST.hasVMEMReadSGPRVALUDefHazard())
802 int WaitStatesNeeded = checkSoftClauseHazards(VMEM);
806 const int VmemSgprWaitStates = 5;
807 auto IsHazardDefFn = [
this](
const MachineInstr &
MI) {
808 return TII.isVALU(
MI);
810 for (
const MachineOperand &Use : VMEM->uses()) {
811 if (!
Use.isReg() || TRI.isVectorRegister(MF.getRegInfo(),
Use.getReg()))
814 int WaitStatesNeededForUse =
815 VmemSgprWaitStates - getWaitStatesSinceDef(
Use.getReg(), IsHazardDefFn,
817 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
819 return WaitStatesNeeded;
823 const SIRegisterInfo *TRI = ST.getRegisterInfo();
824 const SIInstrInfo *TII = ST.getInstrInfo();
827 int DppVgprWaitStates = 2;
828 int DppExecWaitStates = 5;
829 int WaitStatesNeeded = 0;
830 auto IsHazardDefFn = [TII](
const MachineInstr &
MI) {
831 return TII->isVALU(
MI);
834 for (
const MachineOperand &Use :
DPP->uses()) {
835 if (!
Use.isReg() || !TRI->isVGPR(MF.getRegInfo(),
Use.getReg()))
837 int WaitStatesNeededForUse =
838 DppVgprWaitStates - getWaitStatesSinceDef(
840 [](
const MachineInstr &) { return true; },
842 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
845 WaitStatesNeeded = std::max(
847 DppExecWaitStates - getWaitStatesSinceDef(AMDGPU::EXEC, IsHazardDefFn,
850 return WaitStatesNeeded;
853int GCNHazardRecognizer::checkDivFMasHazards(
MachineInstr *DivFMas)
const {
854 const SIInstrInfo *TII = ST.getInstrInfo();
858 const int DivFMasWaitStates = 4;
859 auto IsHazardDefFn = [TII](
const MachineInstr &
MI) {
860 return TII->isVALU(
MI);
862 int WaitStatesNeeded = getWaitStatesSinceDef(AMDGPU::VCC, IsHazardDefFn,
865 return DivFMasWaitStates - WaitStatesNeeded;
868int GCNHazardRecognizer::checkGetRegHazards(
MachineInstr *GetRegInstr)
const {
869 const SIInstrInfo *TII = ST.getInstrInfo();
870 unsigned GetRegHWReg =
getHWReg(TII, *GetRegInstr);
872 const int GetRegWaitStates = 2;
873 auto IsHazardFn = [TII, GetRegHWReg](
const MachineInstr &
MI) {
876 int WaitStatesNeeded = getWaitStatesSinceSetReg(
IsHazardFn, GetRegWaitStates);
878 return GetRegWaitStates - WaitStatesNeeded;
881int GCNHazardRecognizer::checkSetRegHazards(
MachineInstr *SetRegInstr)
const {
882 const SIInstrInfo *TII = ST.getInstrInfo();
883 unsigned HWReg =
getHWReg(TII, *SetRegInstr);
885 const int SetRegWaitStates = ST.getSetRegWaitStates();
886 auto IsHazardFn = [TII, HWReg](
const MachineInstr &
MI) {
889 int WaitStatesNeeded = getWaitStatesSinceSetReg(
IsHazardFn, SetRegWaitStates);
890 return SetRegWaitStates - WaitStatesNeeded;
893int GCNHazardRecognizer::createsVALUHazard(
const MachineInstr &
MI)
const {
897 const SIInstrInfo *TII = ST.getInstrInfo();
898 unsigned Opcode =
MI.getOpcode();
899 const MCInstrDesc &
Desc =
MI.getDesc();
901 int VDataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
904 VDataRCID = TII->getOpRegClassID(
Desc.operands()[VDataIdx]);
906 if (TII->isMUBUF(
MI) || TII->isMTBUF(
MI)) {
913 const MachineOperand *SOffset =
914 TII->getNamedOperand(
MI, AMDGPU::OpName::soffset);
918 (!SOffset || !SOffset->
isReg()))
926 if (TII->isMIMG(
MI)) {
927 int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc);
929 Desc.operands()[SRsrcIdx])) == 256);
933 if (TII->isFLAT(
MI)) {
945int GCNHazardRecognizer::checkVALUHazardsHelper(
949 const SIRegisterInfo *TRI = ST.getRegisterInfo();
951 const int VALUWaitStates = ST.hasGFX940Insts() ? 2 : 1;
952 int WaitStatesNeeded = 0;
954 if (!TRI->isVectorRegister(MRI,
Def.getReg()))
955 return WaitStatesNeeded;
958 int DataIdx = createsVALUHazard(
MI);
959 return DataIdx >= 0 &&
960 TRI->regsOverlap(
MI.getOperand(DataIdx).getReg(),
Reg);
963 int WaitStatesNeededForDef =
964 VALUWaitStates - getWaitStatesSince(
IsHazardFn, VALUWaitStates);
965 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
967 return WaitStatesNeeded;
983 unsigned Opcode =
MI.getOpcode();
993 if (
auto *DstSel =
TII->getNamedOperand(
MI, AMDGPU::OpName::dst_sel))
995 return TII->getNamedOperand(
MI, AMDGPU::OpName::vdst);
1001 if (
TII->getNamedImmOperand(
MI, AMDGPU::OpName::src0_modifiers) &
1003 return TII->getNamedOperand(
MI, AMDGPU::OpName::vdst);
1007 (
TII->getNamedImmOperand(
MI, AMDGPU::OpName::src2_modifiers) &
1009 return TII->getNamedOperand(
MI, AMDGPU::OpName::vdst);
1015 return TII->getNamedOperand(
MI, AMDGPU::OpName::vdst);
1036 for (
auto &Operand : VALU->operands()) {
1037 if (Operand.isReg() &&
TRI->regsOverlap(Dst->getReg(), Operand.getReg())) {
1044int GCNHazardRecognizer::checkVALUHazards(
MachineInstr *VALU)
const {
1045 int WaitStatesNeeded = 0;
1048 const int TransDefWaitstates = 1;
1050 auto IsTransDefFn = [
this,
VALU](
const MachineInstr &
MI) {
1053 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1054 const SIInstrInfo *TII = ST.getInstrInfo();
1055 Register Def = TII->getNamedOperand(
MI, AMDGPU::OpName::vdst)->getReg();
1057 for (
const MachineOperand &Use :
VALU->explicit_uses()) {
1058 if (
Use.isReg() && TRI->regsOverlap(Def,
Use.getReg()))
1065 int WaitStatesNeededForDef =
1066 TransDefWaitstates -
1067 getWaitStatesSince(IsTransDefFn, TransDefWaitstates);
1068 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1071 if (ST.hasDstSelForwardingHazard() || ST.hasCvtScaleForwardingHazard()) {
1072 const int Shift16DefWaitstates = 1;
1074 auto IsShift16BitDefFn = [
this,
VALU](
const MachineInstr &ProducerMI) {
1075 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1076 const MachineOperand *ForwardedDst =
1082 if (ProducerMI.isInlineAsm()) {
1084 for (
auto &Def : ProducerMI.all_defs()) {
1093 int WaitStatesNeededForDef =
1094 Shift16DefWaitstates -
1095 getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates);
1096 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1099 if (ST.hasVDecCoExecHazard()) {
1100 const int VALUWriteSGPRVALUReadWaitstates = 2;
1101 const int VALUWriteEXECRWLane = 4;
1102 const int VALUWriteVGPRReadlaneRead = 1;
1104 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1105 const MachineRegisterInfo &MRI = MF.getRegInfo();
1107 auto IsVALUDefSGPRFn = [&
UseReg, TRI](
const MachineInstr &
MI) {
1110 return MI.modifiesRegister(
UseReg, TRI);
1113 for (
const MachineOperand &Use :
VALU->explicit_uses()) {
1118 if (TRI->isSGPRReg(MRI,
UseReg)) {
1119 int WaitStatesNeededForDef =
1120 VALUWriteSGPRVALUReadWaitstates -
1121 getWaitStatesSince(IsVALUDefSGPRFn,
1122 VALUWriteSGPRVALUReadWaitstates);
1123 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1127 if (
VALU->readsRegister(AMDGPU::VCC, TRI)) {
1129 int WaitStatesNeededForDef =
1130 VALUWriteSGPRVALUReadWaitstates -
1131 getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteSGPRVALUReadWaitstates);
1132 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1135 switch (
VALU->getOpcode()) {
1136 case AMDGPU::V_READLANE_B32:
1137 case AMDGPU::V_READFIRSTLANE_B32: {
1138 MachineOperand *Src = TII.getNamedOperand(*VALU, AMDGPU::OpName::src0);
1140 int WaitStatesNeededForDef =
1141 VALUWriteVGPRReadlaneRead -
1142 getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteVGPRReadlaneRead);
1143 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1146 case AMDGPU::V_WRITELANE_B32: {
1148 int WaitStatesNeededForDef =
1149 VALUWriteEXECRWLane -
1150 getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteEXECRWLane);
1151 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1161 if (!ST.has12DWordStoreHazard())
1162 return WaitStatesNeeded;
1164 const MachineRegisterInfo &MRI = MF.getRegInfo();
1166 for (
const MachineOperand &Def :
VALU->defs()) {
1167 WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Def, MRI));
1170 return WaitStatesNeeded;
1173int GCNHazardRecognizer::checkInlineAsmHazards(
MachineInstr *IA)
const {
1182 if (!ST.has12DWordStoreHazard() && !ST.hasDstSelForwardingHazard() &&
1183 !ST.hasCvtScaleForwardingHazard())
1186 const MachineRegisterInfo &MRI = MF.getRegInfo();
1187 int WaitStatesNeeded = 0;
1189 for (
const MachineOperand &
Op :
1191 if (
Op.isReg() &&
Op.isDef()) {
1192 if (!TRI.isVectorRegister(MRI,
Op.getReg()))
1195 if (ST.has12DWordStoreHazard()) {
1197 std::max(WaitStatesNeeded, checkVALUHazardsHelper(
Op, MRI));
1202 if (ST.hasDstSelForwardingHazard()) {
1203 const int Shift16DefWaitstates = 1;
1205 auto IsShift16BitDefFn = [
this, &
IA](
const MachineInstr &ProducerMI) {
1209 return IA->modifiesRegister(Dst->getReg(), &TRI) ||
1210 IA->readsRegister(Dst->getReg(), &TRI);
1212 if (ProducerMI.isInlineAsm()) {
1214 for (
auto &Def : ProducerMI.all_defs()) {
1215 if (
IA->modifiesRegister(
Def.getReg(), &TRI) ||
1216 IA->readsRegister(
Def.getReg(), &TRI)) {
1225 int WaitStatesNeededForDef =
1226 Shift16DefWaitstates -
1227 getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates);
1228 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1231 return WaitStatesNeeded;
1234int GCNHazardRecognizer::checkRWLaneHazards(
MachineInstr *RWLane)
const {
1235 const SIInstrInfo *TII = ST.getInstrInfo();
1236 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1237 const MachineRegisterInfo &MRI = MF.getRegInfo();
1239 const MachineOperand *LaneSelectOp =
1240 TII->getNamedOperand(*RWLane, AMDGPU::OpName::src1);
1242 if (!LaneSelectOp->
isReg() || !TRI->isSGPRReg(MRI, LaneSelectOp->
getReg()))
1246 auto IsHazardFn = [TII](
const MachineInstr &
MI) {
return TII->isVALU(
MI); };
1248 const int RWLaneWaitStates = 4;
1249 int WaitStatesSince = getWaitStatesSinceDef(LaneSelectReg,
IsHazardFn,
1251 return RWLaneWaitStates - WaitStatesSince;
1254int GCNHazardRecognizer::checkRFEHazards(
MachineInstr *RFE)
const {
1255 if (!ST.hasRFEHazards())
1258 const SIInstrInfo *TII = ST.getInstrInfo();
1260 const int RFEWaitStates = 1;
1265 int WaitStatesNeeded = getWaitStatesSinceSetReg(
IsHazardFn, RFEWaitStates);
1266 return RFEWaitStates - WaitStatesNeeded;
1269int GCNHazardRecognizer::checkReadM0Hazards(
MachineInstr *
MI)
const {
1270 const SIInstrInfo *TII = ST.getInstrInfo();
1271 const int ReadM0WaitStates = 1;
1272 auto IsHazardFn = [TII](
const MachineInstr &
MI) {
return TII->isSALU(
MI); };
1273 return ReadM0WaitStates -
1274 getWaitStatesSinceDef(AMDGPU::M0,
IsHazardFn, ReadM0WaitStates);
1279 int WaitStatesNeeded,
bool IsHoisting) {
1281 for (
int I = 0;
I < WaitStatesNeeded; ++
I)
1282 BuildMI(
MBB, InsertPt,
DL, TII.get(AMDGPU::V_NOP_e32));
1286 fixVMEMtoScalarWriteHazards(
MI);
1287 fixVcmpxPermlaneHazards(
MI);
1288 fixSMEMtoVectorWriteHazards(
MI);
1289 fixVcmpxExecWARHazard(
MI);
1290 fixLdsBranchVmemWARHazard(
MI);
1291 if (ST.hasLdsDirect()) {
1292 fixLdsDirectVALUHazard(
MI);
1293 fixLdsDirectVMEMHazard(
MI);
1295 fixVALUPartialForwardingHazard(
MI);
1296 fixVALUTransUseHazard(
MI);
1297 fixVALUTransCoexecutionHazards(
MI);
1299 fixWMMACoexecutionHazards(
MI);
1300 fixShift64HighRegBug(
MI);
1301 fixVALUMaskWriteHazard(
MI);
1302 fixRequiredExportPriority(
MI);
1303 if (ST.requiresWaitIdleBeforeGetReg())
1304 fixGetRegWaitIdle(
MI);
1305 if (ST.hasDsAtomicAsyncBarrierArriveB64PipeBug())
1306 fixDsAtomicAsyncBarrierArriveB64(
MI);
1307 if (ST.hasScratchBaseForwardingHazard())
1308 fixScratchBaseForwardingHazard(
MI);
1309 if (ST.setRegModeNeedsVNOPs())
1315 return (
TII.isVOPC(
MI) ||
1316 (
MI.isCompare() && (
TII.isVOP3(
MI) ||
TII.isSDWA(
MI)))) &&
1317 MI.modifiesRegister(AMDGPU::EXEC, &
TRI);
1320bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(
MachineInstr *
MI) {
1324 const SIInstrInfo *TII = ST.getInstrInfo();
1325 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1331 unsigned Opc =
MI.getOpcode();
1333 Opc != AMDGPU::V_NOP_e64 &&
Opc != AMDGPU::V_NOP_sdwa;
1337 std::numeric_limits<int>::max())
1343 auto *Src0 = TII->getNamedOperand(*
MI, AMDGPU::OpName::src0);
1345 bool IsUndef = Src0->isUndef();
1347 TII->get(AMDGPU::V_MOV_B32_e32))
1354bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(
MachineInstr *
MI) {
1355 if (!ST.hasVMEMtoScalarWriteHazard())
1357 assert(!ST.hasExtendedWaitCounts());
1362 if (
MI->getNumDefs() == 0)
1365 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1371 for (
const MachineOperand &Def :
MI->defs()) {
1372 const MachineOperand *
Op =
1373 I.findRegisterUseOperand(
Def.getReg(), TRI,
false);
1383 (
MI.getOpcode() == AMDGPU::S_WAITCNT &&
1384 !
MI.getOperand(0).getImm()) ||
1385 (
MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1390 std::numeric_limits<int>::max())
1393 const SIInstrInfo *TII = ST.getInstrInfo();
1395 TII->get(AMDGPU::S_WAITCNT_DEPCTR))
1400bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(
MachineInstr *
MI) {
1401 if (!ST.hasSMEMtoVectorWriteHazard())
1403 assert(!ST.hasExtendedWaitCounts());
1408 AMDGPU::OpName SDSTName;
1409 switch (
MI->getOpcode()) {
1410 case AMDGPU::V_READLANE_B32:
1411 case AMDGPU::V_READFIRSTLANE_B32:
1412 SDSTName = AMDGPU::OpName::vdst;
1415 SDSTName = AMDGPU::OpName::sdst;
1419 const SIInstrInfo *TII = ST.getInstrInfo();
1420 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1422 const MachineOperand *SDST = TII->getNamedOperand(*
MI, SDSTName);
1424 for (
const auto &MO :
MI->implicit_operands()) {
1425 if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegBaseClass(MO.getReg()))) {
1436 auto IsHazardFn = [SDSTReg, TRI](
const MachineInstr &
I) {
1441 if (TII->isSALU(
MI)) {
1442 switch (
MI.getOpcode()) {
1443 case AMDGPU::S_SETVSKIP:
1444 case AMDGPU::S_VERSION:
1445 case AMDGPU::S_WAITCNT_VSCNT:
1446 case AMDGPU::S_WAITCNT_VMCNT:
1447 case AMDGPU::S_WAITCNT_EXPCNT:
1450 case AMDGPU::S_WAITCNT_LGKMCNT:
1452 return (
MI.getOperand(1).getImm() == 0) &&
1453 (
MI.getOperand(0).
getReg() == AMDGPU::SGPR_NULL);
1454 case AMDGPU::S_WAITCNT: {
1455 const int64_t
Imm =
MI.getOperand(0).getImm();
1462 MI.getOpcode() == AMDGPU::S_WAIT_IDLE) &&
1463 "unexpected wait count instruction");
1465 if (TII->isSOPP(
MI))
1481 std::numeric_limits<int>::max())
1485 TII->get(AMDGPU::S_MOV_B32), AMDGPU::SGPR_NULL)
1490bool GCNHazardRecognizer::fixVcmpxExecWARHazard(
MachineInstr *
MI) {
1491 if (!ST.hasVcmpxExecWARHazard())
1493 assert(!ST.hasExtendedWaitCounts());
1498 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1499 if (!
MI->modifiesRegister(AMDGPU::EXEC, TRI))
1505 return I.readsRegister(AMDGPU::EXEC, TRI);
1508 const SIInstrInfo *TII = ST.getInstrInfo();
1509 auto IsExpiredFn = [TII, TRI](
const MachineInstr &
MI, int) {
1511 if (TII->getNamedOperand(
MI, AMDGPU::OpName::sdst))
1513 for (
auto MO :
MI.implicit_operands())
1514 if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegBaseClass(MO.getReg())))
1517 if (
MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1524 std::numeric_limits<int>::max())
1528 TII->get(AMDGPU::S_WAITCNT_DEPCTR))
1535 if (!ST.hasLdsBranchVmemWARHazard())
1540 bool HasLds =
false;
1541 bool HasVmem =
false;
1542 for (
auto &
MBB : MF) {
1543 for (
auto &
MI :
MBB) {
1546 if (HasLds && HasVmem)
1554 return I.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
1555 I.getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
1556 !
I.getOperand(1).getImm();
1559bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(
MachineInstr *
MI) {
1560 if (!RunLdsBranchVmemWARHazardFixup)
1563 assert(ST.hasLdsBranchVmemWARHazard());
1564 assert(!ST.hasExtendedWaitCounts());
1566 auto IsHazardInst = [](
const MachineInstr &
MI) {
1574 auto InstType = IsHazardInst(*
MI);
1578 auto IsExpiredFn = [&IsHazardInst](
const MachineInstr &
I, int) {
1582 auto IsHazardFn = [InstType, &IsHazardInst](
const MachineInstr &
I) {
1586 auto IsHazardFn = [InstType, IsHazardInst](
const MachineInstr &
I) {
1587 auto InstType2 = IsHazardInst(
I);
1588 return InstType2 && InstType != InstType2;
1591 auto IsExpiredFn = [InstType, &IsHazardInst](
const MachineInstr &
I, int) {
1592 auto InstType2 = IsHazardInst(
I);
1593 if (InstType == InstType2)
1600 std::numeric_limits<int>::max();
1604 std::numeric_limits<int>::max())
1607 const SIInstrInfo *TII = ST.getInstrInfo();
1609 TII->get(AMDGPU::S_WAITCNT_VSCNT))
1616bool GCNHazardRecognizer::fixLdsDirectVALUHazard(
MachineInstr *
MI) {
1620 const int NoHazardWaitStates = 15;
1621 const MachineOperand *VDST = TII.getNamedOperand(*
MI, AMDGPU::OpName::vdst);
1624 bool VisitedTrans =
false;
1625 auto IsHazardFn = [
this, VDSTReg, &VisitedTrans](
const MachineInstr &
I) {
1630 return I.readsRegister(VDSTReg, &TRI) ||
I.modifiesRegister(VDSTReg, &TRI);
1632 auto IsExpiredFn = [&](
const MachineInstr &
I,
int WaitStates) {
1633 if (WaitStates >= NoHazardWaitStates)
1639 auto GetWaitStatesFn = [](
const MachineInstr &
MI) {
1643 DenseSet<const MachineBasicBlock *> Visited;
1645 std::next(
MI->getReverseIterator()), 0,
1653 MachineOperand *WaitVdstOp =
1654 TII.getNamedOperand(*
MI, AMDGPU::OpName::waitvdst);
1655 WaitVdstOp->
setImm(std::min(
Count, NoHazardWaitStates));
1660bool GCNHazardRecognizer::fixLdsDirectVMEMHazard(
MachineInstr *
MI) {
1664 const MachineOperand *VDST = TII.getNamedOperand(*
MI, AMDGPU::OpName::vdst);
1667 auto IsHazardFn = [
this, VDSTReg](
const MachineInstr &
I) {
1670 return I.readsRegister(VDSTReg, &TRI) ||
I.modifiesRegister(VDSTReg, &TRI);
1672 bool LdsdirCanWait = ST.hasLdsWaitVMSRC();
1675 auto IsExpiredFn = [
this, LdsdirCanWait](
const MachineInstr &
I, int) {
1677 (
I.getOpcode() == AMDGPU::S_WAITCNT && !
I.getOperand(0).getImm()) ||
1678 (
I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1681 !TII.getNamedOperand(
I, AMDGPU::OpName::waitvsrc)->getImm());
1685 std::numeric_limits<int>::max())
1688 if (LdsdirCanWait) {
1689 TII.getNamedOperand(*
MI, AMDGPU::OpName::waitvsrc)->setImm(0);
1692 TII.get(AMDGPU::S_WAITCNT_DEPCTR))
1699bool GCNHazardRecognizer::fixVALUPartialForwardingHazard(
MachineInstr *
MI) {
1700 if (!ST.hasVALUPartialForwardingHazard())
1702 assert(!ST.hasExtendedWaitCounts());
1707 SmallSetVector<Register, 4> SrcVGPRs;
1709 for (
const MachineOperand &Use :
MI->explicit_uses()) {
1710 if (
Use.isReg() && TRI.isVGPR(MF.getRegInfo(),
Use.getReg()))
1715 if (SrcVGPRs.
size() <= 1)
1733 const int Intv1plus2MaxVALUs = 2;
1734 const int Intv3MaxVALUs = 4;
1735 const int IntvMaxVALUs = 6;
1736 const int NoHazardVALUWaitStates = IntvMaxVALUs + 2;
1739 SmallDenseMap<Register, int, 4> DefPos;
1740 int ExecPos = std::numeric_limits<int>::max();
1743 static unsigned getHashValue(
const StateType &State) {
1747 static bool isEqual(
const StateType &
LHS,
const StateType &
RHS) {
1748 return LHS.DefPos ==
RHS.DefPos &&
LHS.ExecPos ==
RHS.ExecPos &&
1756 auto IsHazardFn = [&,
this](StateType &State,
const MachineInstr &
I) {
1758 if (State.VALUs > NoHazardVALUWaitStates)
1764 (
I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1772 if (!State.DefPos.count(Src) &&
I.modifiesRegister(Src, &TRI)) {
1773 State.DefPos[Src] = State.VALUs;
1778 if (State.ExecPos == std::numeric_limits<int>::max()) {
1779 if (!State.DefPos.empty() &&
I.modifiesRegister(AMDGPU::EXEC, &TRI)) {
1780 State.ExecPos = State.VALUs;
1787 if (State.VALUs > Intv3MaxVALUs && State.DefPos.empty())
1795 if (State.ExecPos == std::numeric_limits<int>::max())
1798 int PreExecPos = std::numeric_limits<int>::max();
1799 int PostExecPos = std::numeric_limits<int>::max();
1801 for (
auto Entry : State.DefPos) {
1802 int DefVALUs =
Entry.second;
1803 if (DefVALUs != std::numeric_limits<int>::max()) {
1804 if (DefVALUs >= State.ExecPos)
1805 PreExecPos = std::min(PreExecPos, DefVALUs);
1807 PostExecPos = std::min(PostExecPos, DefVALUs);
1812 if (PostExecPos == std::numeric_limits<int>::max())
1816 int Intv3VALUs = PostExecPos;
1817 if (Intv3VALUs > Intv3MaxVALUs)
1821 int Intv2VALUs = (State.ExecPos - PostExecPos) - 1;
1822 if (Intv2VALUs > Intv1plus2MaxVALUs)
1826 if (PreExecPos == std::numeric_limits<int>::max())
1830 int Intv1VALUs = PreExecPos - State.ExecPos;
1831 if (Intv1VALUs > Intv1plus2MaxVALUs)
1835 if (Intv1VALUs + Intv2VALUs > Intv1plus2MaxVALUs)
1840 auto UpdateStateFn = [](StateType &State,
const MachineInstr &
MI) {
1846 std::next(
MI->getReverseIterator())))
1850 TII.get(AMDGPU::S_WAITCNT_DEPCTR))
1856bool GCNHazardRecognizer::fixVALUTransUseHazard(
MachineInstr *
MI) {
1857 if (!ST.hasVALUTransUseHazard())
1859 assert(!ST.hasExtendedWaitCounts());
1864 SmallSet<Register, 4> SrcVGPRs;
1866 for (
const MachineOperand &Use :
MI->explicit_uses()) {
1867 if (
Use.isReg() && TRI.isVGPR(MF.getRegInfo(),
Use.getReg()))
1881 const int IntvMaxVALUs = 5;
1882 const int IntvMaxTRANS = 1;
1888 static unsigned getHashValue(
const StateType &State) {
1891 static bool isEqual(
const StateType &
LHS,
const StateType &
RHS) {
1892 return LHS.VALUs ==
RHS.VALUs &&
LHS.TRANS ==
RHS.TRANS;
1899 auto IsHazardFn = [&,
this](StateType &State,
const MachineInstr &
I) {
1901 if (State.VALUs > IntvMaxVALUs || State.TRANS > IntvMaxTRANS)
1907 (
I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1914 if (
I.modifiesRegister(Src, &TRI)) {
1922 auto UpdateStateFn = [](StateType &State,
const MachineInstr &
MI) {
1930 std::next(
MI->getReverseIterator())))
1936 TII.get(AMDGPU::S_WAITCNT_DEPCTR))
1942bool GCNHazardRecognizer::fixVALUTransCoexecutionHazards(
MachineInstr *
MI) {
1943 if (!ST.hasGFX1250Insts() ||
1947 const SIInstrInfo *TII = ST.getInstrInfo();
1948 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1950 auto IsTransHazardFn = [
MI, TII, TRI](
const MachineInstr &
I) {
1955 Register TransDef = TII->getNamedOperand(
I, AMDGPU::OpName::vdst)->getReg();
1956 for (
const MachineOperand &ValuUse :
MI->explicit_uses()) {
1957 if (ValuUse.isReg() && TRI->regsOverlap(TransDef, ValuUse.getReg()))
1961 auto *ValuDst = TII->getNamedOperand(*
MI, AMDGPU::OpName::vdst);
1962 if (!ValuDst || !ValuDst->isReg())
1966 Register ValuDef = ValuDst->getReg();
1967 for (
const MachineOperand &TransUse :
I.explicit_uses()) {
1968 if (TransUse.isReg() && TRI->regsOverlap(ValuDef, TransUse.getReg()))
1979 const int HasVALU = std::numeric_limits<int>::max();
1980 if (::getWaitStatesSince(IsTransHazardFn,
MI,
IsExpiredFn) == HasVALU)
1983 BuildMI(*
MI->getParent(),
MI,
MI->getDebugLoc(), TII->get(AMDGPU::V_NOP_e32));
1991 const SIInstrInfo *TII = ST.getInstrInfo();
1992 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1994 auto IsHazardFn = [
MI, TII, TRI,
this](
const MachineInstr &
I) {
2001 TII->getNamedOperand(*
MI, AMDGPU::OpName::src0)->getReg();
2003 TII->getNamedOperand(*
MI, AMDGPU::OpName::src1)->getReg();
2006 TII->getNamedOperand(
I, AMDGPU::OpName::vdst)->getReg();
2008 if (TRI->regsOverlap(PrevDstReg, CurSrc0Reg) ||
2009 TRI->regsOverlap(PrevDstReg, CurSrc1Reg)) {
2018 TII->getNamedOperand(*
MI, AMDGPU::OpName::src2)->getReg();
2019 if (TRI->regsOverlap(PrevDstReg, CurIndex))
2033 std::numeric_limits<int>::max())
2036 BuildMI(*
MI->getParent(),
MI,
MI->getDebugLoc(), TII->get(AMDGPU::V_NOP_e32));
2048 unsigned Category) {
2050 "Handle me if the xdl wmma instruction latency changes");
2087int GCNHazardRecognizer::checkWMMACoexecutionHazards(
MachineInstr *
MI)
const {
2088 if (!ST.hasGFX1250Insts())
2091 const SIInstrInfo *TII = ST.getInstrInfo();
2100 const int WMMAWaitStates[] = {5, 9, 3, 5};
2101 const int VALUWaitStates[] = {4, 8, 2, 4};
2102 unsigned Category = 0;
2104 auto IsWMMAHazardFn = [
MI, TII, &Category,
this](
const MachineInstr &
I) {
2105 if (!TII->isXDLWMMA(
I))
2108 unsigned Latency = TSchedModel.computeInstrLatency(&
I);
2112 return hasWMMAToWMMARegOverlap(
I, *
MI);
2115 auto IsVALUHazardFn = [
MI, TII, &Category,
this](
const MachineInstr &
I) {
2116 if (!TII->isXDLWMMA(
I))
2119 unsigned Latency = TSchedModel.computeInstrLatency(&
I);
2123 return hasWMMAToVALURegOverlap(
I, *
MI);
2128 auto GetWaitStatesFn = [](
const MachineInstr &
I) {
2132 int WaitStatesNeeded = -1;
2133 if (TII->isXDLWMMA(*
MI)) {
2134 for (Category = 0; WaitStatesNeeded < 0 && Category < 4; Category++) {
2135 Limit = WMMAWaitStates[Category];
2141 Limit - getWaitStatesSince(IsWMMAHazardFn, Limit, GetWaitStatesFn);
2144 for (Category = 0; WaitStatesNeeded < 0 && Category < 4; Category++) {
2145 Limit = VALUWaitStates[Category];
2151 Limit - getWaitStatesSince(IsVALUHazardFn, Limit, GetWaitStatesFn);
2155 return WaitStatesNeeded;
2158bool GCNHazardRecognizer::hasWMMAToWMMARegOverlap(
2160 Register D0 = TII.getNamedOperand(WMMA, AMDGPU::OpName::vdst)->getReg();
2161 Register A1 = TII.getNamedOperand(
MI, AMDGPU::OpName::src0)->getReg();
2162 Register B1 = TII.getNamedOperand(
MI, AMDGPU::OpName::src1)->getReg();
2165 if (TRI.regsOverlap(D0, A1) || TRI.regsOverlap(D0, B1))
2169 Register Idx1 = TII.getNamedOperand(
MI, AMDGPU::OpName::src2)->getReg();
2170 if (TRI.regsOverlap(D0, Idx1))
2176bool GCNHazardRecognizer::hasWMMAToVALURegOverlap(
2179 Register D0 = TII.getNamedOperand(WMMA, AMDGPU::OpName::vdst)->getReg();
2180 for (
const MachineOperand &ValuUse :
MI.explicit_uses()) {
2181 if (ValuUse.isReg() && TRI.regsOverlap(D0, ValuUse.getReg()))
2186 Register A0 = TII.getNamedOperand(WMMA, AMDGPU::OpName::src0)->getReg();
2187 Register B0 = TII.getNamedOperand(WMMA, AMDGPU::OpName::src1)->getReg();
2191 Register Idx0 = TII.getNamedOperand(WMMA, AMDGPU::OpName::src2)->getReg();
2192 WMMARegs.push_back(Idx0);
2195 for (
const MachineOperand &ValuDef :
MI.defs()) {
2196 Register VDstReg = ValuDef.getReg();
2197 for (
Register WMMAReg : WMMARegs) {
2198 if (TRI.regsOverlap(VDstReg, WMMAReg))
2205bool GCNHazardRecognizer::isCoexecutionHazardFor(
const MachineInstr &
I,
2209 if (!TII.isXDLWMMA(
I))
2213 if (TII.isXDLWMMA(
MI))
2214 return hasWMMAToWMMARegOverlap(
I,
MI);
2216 return hasWMMAToVALURegOverlap(
I,
MI);
2222 bool IncludeSubloops) {
2225 for (MachineBasicBlock *
MBB :
L->getBlocks()) {
2226 if (!IncludeSubloops && MLI->getLoopFor(
MBB) != L)
2228 for (MachineInstr &
I : *
MBB) {
2231 if (isCoexecutionHazardFor(
I, *
MI))
2238bool GCNHazardRecognizer::tryHoistWMMAVnopsFromLoop(
MachineInstr *
MI,
2239 int WaitStatesNeeded) {
2243 MachineLoop *
L = MLI->getLoopFor(
MI->getParent());
2245 ++NumWMMAHoistingBailed;
2250 if (hasWMMAHazardInLoop(L,
MI)) {
2251 ++NumWMMAHoistingBailed;
2256 MachineLoop *TargetLoop =
L;
2258 if (hasWMMAHazardInLoop(Parent,
MI,
false))
2260 TargetLoop = Parent;
2266 ++NumWMMAHoistingBailed;
2270 LLVM_DEBUG(
dbgs() <<
"WMMA V_NOP Hoisting: Moving " << WaitStatesNeeded
2276 NumWMMANopsHoisted += WaitStatesNeeded;
2280bool GCNHazardRecognizer::fixWMMACoexecutionHazards(
MachineInstr *
MI) {
2281 int WaitStatesNeeded = checkWMMACoexecutionHazards(
MI);
2282 if (WaitStatesNeeded <= 0)
2288 emitVNops(*
MI->getParent(),
MI->getIterator(), WaitStatesNeeded);
2292bool GCNHazardRecognizer::fixShift64HighRegBug(
MachineInstr *
MI) {
2293 if (!ST.hasShift64HighRegBug())
2295 assert(!ST.hasExtendedWaitCounts());
2297 switch (
MI->getOpcode()) {
2300 case AMDGPU::V_LSHLREV_B64_e64:
2301 case AMDGPU::V_LSHRREV_B64_e64:
2302 case AMDGPU::V_ASHRREV_I64_e64:
2306 MachineOperand *Amt = TII.getNamedOperand(*
MI, AMDGPU::OpName::src0);
2311 const MachineRegisterInfo &MRI = MF.getRegInfo();
2313 if (!TRI.isVGPR(MRI, AmtReg) || ((AmtReg - AMDGPU::VGPR0) & 7) != 7)
2316 if (AmtReg != AMDGPU::VGPR255 && MRI.
isPhysRegUsed(AmtReg + 1))
2319 assert(ST.needsAlignedVGPRs());
2320 static_assert(AMDGPU::VGPR0 + 1 == AMDGPU::VGPR1);
2323 MachineBasicBlock *
MBB =
MI->getParent();
2324 MachineOperand *Src1 = TII.getNamedOperand(*
MI, AMDGPU::OpName::src1);
2335 Register DstReg =
MI->getOperand(0).getReg();
2337 Register DstLo = TRI.getSubReg(DstReg, AMDGPU::sub0);
2345 bool Overlapped =
MI->modifiesRegister(AmtReg, &TRI);
2347 for (MCRegister
Reg : Overlapped ? AMDGPU::VReg_64_Align2RegClass
2348 : AMDGPU::VGPR_32RegClass) {
2349 if (!
MI->modifiesRegister(
Reg, &TRI) && !
MI->readsRegister(
Reg, &TRI)) {
2355 Register NewAmt = Overlapped ? (
Register)TRI.getSubReg(NewReg, AMDGPU::sub1)
2360 NewAmtLo = TRI.getSubReg(NewReg, AMDGPU::sub0);
2373 runOnInstruction(
BuildMI(*
MBB,
MI,
DL, TII.get(AMDGPU::V_SWAP_B32), NewAmt)
2380 BuildMI(*
MBB, std::next(
MI->getIterator()),
DL, TII.get(AMDGPU::V_SWAP_B32),
2386 BuildMI(*
MBB, std::next(
MI->getIterator()),
DL, TII.get(AMDGPU::V_SWAP_B32),
2400 MI->getOperand(0).setReg(NewReg);
2409int GCNHazardRecognizer::checkNSAtoVMEMHazard(
MachineInstr *
MI)
const {
2410 int NSAtoVMEMWaitStates = 1;
2412 if (!ST.hasNSAtoVMEMBug())
2418 const SIInstrInfo *TII = ST.getInstrInfo();
2419 const auto *
Offset = TII->getNamedOperand(*
MI, AMDGPU::OpName::offset);
2427 return Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA &&
2428 TII->getInstSizeInBytes(
I) >= 16;
2431 return NSAtoVMEMWaitStates - getWaitStatesSince(
IsHazardFn, 1);
2434int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(
2436 int FPAtomicToDenormModeWaitStates = 3;
2438 if (!ST.hasFPAtomicToDenormModeHazard())
2440 assert(!ST.hasExtendedWaitCounts());
2442 if (
MI->getOpcode() != AMDGPU::S_DENORM_MODE)
2451 auto IsExpiredFn = [](
const MachineInstr &
MI,
int WaitStates) {
2458 return FPAtomicToDenormModeWaitStates -
2462int GCNHazardRecognizer::checkMAIHazards(
MachineInstr *
MI)
const {
2465 return ST.hasGFX90AInsts() ? checkMAIHazards90A(
MI) : checkMAIHazards908(
MI);
2468int GCNHazardRecognizer::checkMFMAPadding(
MachineInstr *
MI)
const {
2473 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2477 int NeighborMFMALatency = 0;
2478 auto IsNeighboringMFMA = [&NeighborMFMALatency,
2479 this](
const MachineInstr &
MI) {
2483 NeighborMFMALatency = this->getMFMAPipelineWaitStates(
MI);
2487 const int MaxMFMAPipelineWaitStates = 16;
2488 int WaitStatesSinceNeighborMFMA =
2489 getWaitStatesSince(IsNeighboringMFMA, MaxMFMAPipelineWaitStates);
2491 int NeighborMFMAPaddingNeeded =
2493 WaitStatesSinceNeighborMFMA;
2495 return std::max(0, NeighborMFMAPaddingNeeded);
2498int GCNHazardRecognizer::checkMAIHazards908(
MachineInstr *
MI)
const {
2499 int WaitStatesNeeded = 0;
2500 unsigned Opc =
MI->getOpcode();
2502 auto IsVALUFn = [](
const MachineInstr &
MI) {
2506 if (
Opc != AMDGPU::V_ACCVGPR_READ_B32_e64) {
2507 const int LegacyVALUWritesVGPRWaitStates = 2;
2508 const int VALUWritesExecWaitStates = 4;
2509 const int MaxWaitStates = 4;
2511 int WaitStatesNeededForUse = VALUWritesExecWaitStates -
2512 getWaitStatesSinceDef(AMDGPU::EXEC, IsVALUFn, MaxWaitStates);
2513 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2515 if (WaitStatesNeeded < MaxWaitStates) {
2516 for (
const MachineOperand &Use :
MI->explicit_uses()) {
2517 const int MaxWaitStates = 2;
2519 if (!
Use.isReg() || !TRI.isVGPR(MF.getRegInfo(),
Use.getReg()))
2522 int WaitStatesNeededForUse = LegacyVALUWritesVGPRWaitStates -
2523 getWaitStatesSinceDef(
Use.getReg(), IsVALUFn, MaxWaitStates);
2524 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2526 if (WaitStatesNeeded == MaxWaitStates)
2532 for (
const MachineOperand &
Op :
MI->explicit_operands()) {
2533 if (!
Op.isReg() || !TRI.isAGPR(MF.getRegInfo(),
Op.getReg()))
2536 if (
Op.isDef() &&
Opc != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
2539 const int MFMAWritesAGPROverlappedSrcABWaitStates = 4;
2540 const int MFMAWritesAGPROverlappedSrcCWaitStates = 2;
2541 const int MFMA4x4WritesAGPRAccVgprReadWaitStates = 4;
2542 const int MFMA16x16WritesAGPRAccVgprReadWaitStates = 10;
2543 const int MFMA32x32WritesAGPRAccVgprReadWaitStates = 18;
2544 const int MFMA4x4WritesAGPRAccVgprWriteWaitStates = 1;
2545 const int MFMA16x16WritesAGPRAccVgprWriteWaitStates = 7;
2546 const int MFMA32x32WritesAGPRAccVgprWriteWaitStates = 15;
2547 const int MaxWaitStates = 18;
2549 unsigned HazardDefLatency = 0;
2551 auto IsOverlappedMFMAFn = [
Reg, &HazardDefLatency,
2552 this](
const MachineInstr &
MI) {
2559 std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&
MI));
2560 return TRI.regsOverlap(DstReg,
Reg);
2563 int WaitStatesSinceDef = getWaitStatesSinceDef(
Reg, IsOverlappedMFMAFn,
2565 int NeedWaitStates = MFMAWritesAGPROverlappedSrcABWaitStates;
2566 int SrcCIdx = AMDGPU::getNamedOperandIdx(
Opc, AMDGPU::OpName::src2);
2567 int OpNo =
Op.getOperandNo();
2568 if (OpNo == SrcCIdx) {
2569 NeedWaitStates = MFMAWritesAGPROverlappedSrcCWaitStates;
2570 }
else if (
Opc == AMDGPU::V_ACCVGPR_READ_B32_e64) {
2571 switch (HazardDefLatency) {
2572 case 2: NeedWaitStates = MFMA4x4WritesAGPRAccVgprReadWaitStates;
2574 case 8: NeedWaitStates = MFMA16x16WritesAGPRAccVgprReadWaitStates;
2576 case 16: [[fallthrough]];
2577 default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprReadWaitStates;
2580 }
else if (
Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
2581 switch (HazardDefLatency) {
2582 case 2: NeedWaitStates = MFMA4x4WritesAGPRAccVgprWriteWaitStates;
2584 case 8: NeedWaitStates = MFMA16x16WritesAGPRAccVgprWriteWaitStates;
2586 case 16: [[fallthrough]];
2587 default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprWriteWaitStates;
2592 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2593 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2595 if (WaitStatesNeeded == MaxWaitStates)
2596 return WaitStatesNeeded;
2598 auto IsAccVgprWriteFn = [
Reg,
this](
const MachineInstr &
MI) {
2599 if (
MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
2602 return TRI.regsOverlap(
Reg, DstReg);
2605 const int AccVGPRWriteMFMAReadSrcCWaitStates = 1;
2606 const int AccVGPRWriteMFMAReadSrcABWaitStates = 3;
2607 const int AccVGPRWriteAccVgprReadWaitStates = 3;
2608 NeedWaitStates = AccVGPRWriteMFMAReadSrcABWaitStates;
2609 if (OpNo == SrcCIdx)
2610 NeedWaitStates = AccVGPRWriteMFMAReadSrcCWaitStates;
2611 else if (
Opc == AMDGPU::V_ACCVGPR_READ_B32_e64)
2612 NeedWaitStates = AccVGPRWriteAccVgprReadWaitStates;
2614 WaitStatesNeededForUse = NeedWaitStates -
2615 getWaitStatesSinceDef(
Reg, IsAccVgprWriteFn, MaxWaitStates);
2616 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2618 if (WaitStatesNeeded == MaxWaitStates)
2619 return WaitStatesNeeded;
2622 if (
Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
2623 const int MFMA4x4ReadSrcCAccVgprWriteWaitStates = 0;
2624 const int MFMA16x16ReadSrcCAccVgprWriteWaitStates = 5;
2625 const int MFMA32x32ReadSrcCAccVgprWriteWaitStates = 13;
2626 const int MaxWaitStates = 13;
2627 Register DstReg =
MI->getOperand(0).getReg();
2628 unsigned HazardDefLatency = 0;
2630 auto IsSrcCMFMAFn = [DstReg, &HazardDefLatency,
2631 this](
const MachineInstr &
MI) {
2634 Register Reg = TII.getNamedOperand(
MI, AMDGPU::OpName::src2)->getReg();
2636 std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&
MI));
2637 return TRI.regsOverlap(
Reg, DstReg);
2640 int WaitStatesSince = getWaitStatesSince(IsSrcCMFMAFn, MaxWaitStates);
2642 switch (HazardDefLatency) {
2643 case 2: NeedWaitStates = MFMA4x4ReadSrcCAccVgprWriteWaitStates;
2645 case 8: NeedWaitStates = MFMA16x16ReadSrcCAccVgprWriteWaitStates;
2647 case 16: [[fallthrough]];
2648 default: NeedWaitStates = MFMA32x32ReadSrcCAccVgprWriteWaitStates;
2652 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSince;
2653 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2657 WaitStatesNeeded = std::max(WaitStatesNeeded, checkMFMAPadding(
MI));
2659 return WaitStatesNeeded;
2670 return NumPasses + 1 + IsGFX950;
2681 return NumPasses + 1 + (NumPasses != 2 && IsGFX950);
2699 return NumPasses + 2;
2709 return NumPasses + 3 + (NumPasses != 2 && IsGFX950);
2712int GCNHazardRecognizer::checkMAIHazards90A(
MachineInstr *
MI)
const {
2713 int WaitStatesNeeded = 0;
2714 unsigned Opc =
MI->getOpcode();
2716 auto IsLegacyVALUFn = [](
const MachineInstr &
MI) {
2720 auto IsLegacyVALUNotDotFn = [](
const MachineInstr &
MI) {
2726 return WaitStatesNeeded;
2728 const int VALUWritesExecWaitStates = 4;
2729 int WaitStatesNeededForUse = VALUWritesExecWaitStates -
2730 getWaitStatesSinceDef(AMDGPU::EXEC, IsLegacyVALUFn,
2731 VALUWritesExecWaitStates);
2732 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2734 int SrcCIdx = AMDGPU::getNamedOperandIdx(
Opc, AMDGPU::OpName::src2);
2737 for (
const MachineOperand &Use :
MI->explicit_uses()) {
2738 const int LegacyVALUNotDotWritesVGPRWaitStates = 2;
2739 const int SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates = 2;
2740 const int SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates = 8;
2741 const int SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates = 16;
2742 const int SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates = 3;
2743 const int SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates = 9;
2744 const int SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates = 17;
2745 const int DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 9;
2746 const int GFX950_DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 17;
2747 const int DMFMA4x4WritesVGPROverlappedSrcCWaitStates = 4;
2748 const int SMFMA4x4WritesVGPROverlappedSrcABWaitStates = 5;
2749 const int SMFMA16x16WritesVGPROverlappedSrcABWaitStates = 11;
2750 const int SMFMA32x32WritesVGPROverlappedSrcABWaitStates = 19;
2751 const int DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates = 6;
2752 const int DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 11;
2753 const int GFX950_DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 19;
2754 const int DMFMA4x4WritesVGPRFullSrcCWaitStates = 4;
2755 const int GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates = 2;
2756 const int MaxWaitStates = 19;
2762 const MachineInstr *MI1;
2764 auto IsOverlappedMFMAFn = [
Reg, &FullReg, &MI1,
2765 this](
const MachineInstr &
MI) {
2769 FullReg = (DstReg ==
Reg);
2771 return TRI.regsOverlap(DstReg,
Reg);
2774 WaitStatesNeededForUse = LegacyVALUNotDotWritesVGPRWaitStates -
2775 getWaitStatesSinceDef(
Reg, IsLegacyVALUNotDotFn, MaxWaitStates);
2776 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2779 getWaitStatesSinceDef(
Reg, IsOverlappedMFMAFn, MaxWaitStates);
2780 if (NumWaitStates == std::numeric_limits<int>::max())
2783 int OpNo =
Use.getOperandNo();
2785 int NeedWaitStates = 0;
2786 if (OpNo == SrcCIdx) {
2790 }
else if (FullReg) {
2791 if ((
Opc == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
2792 Opc == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64) &&
2793 (Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
2794 Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64))
2795 NeedWaitStates = DMFMA4x4WritesVGPRFullSrcCWaitStates;
2796 else if (ST.hasGFX940Insts() &&
2797 TSchedModel.computeInstrLatency(MI1) == 2)
2798 NeedWaitStates = GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates;
2801 case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
2802 case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
2803 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
2804 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
2805 if (!TII.isXDL(*
MI))
2808 ? GFX950_DMFMA16x16WritesVGPROverlappedSrcCWaitStates
2809 : DMFMA16x16WritesVGPROverlappedSrcCWaitStates;
2811 case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
2812 case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
2813 if (!TII.isXDL(*
MI))
2814 NeedWaitStates = DMFMA4x4WritesVGPROverlappedSrcCWaitStates;
2817 int NumPasses = TSchedModel.computeInstrLatency(MI1);
2818 if (ST.hasGFX940Insts()) {
2819 if (TII.isXDL(*
MI) && !TII.isXDL(*MI1))
2826 NumPasses, ST.hasGFX950Insts())
2828 NumPasses, ST.hasGFX950Insts()))
2834 switch (NumPasses) {
2838 ? SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates
2839 : SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates;
2844 ? SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates
2845 : SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates;
2850 ? SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates
2851 : SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates;
2860 case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
2861 case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
2862 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
2863 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
2866 ? GFX950_DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates
2867 : DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates;
2869 case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
2870 case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
2871 NeedWaitStates = DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates;
2874 int NumPasses = TSchedModel.computeInstrLatency(MI1);
2876 if (ST.hasGFX940Insts()) {
2880 NumPasses, ST.hasGFX950Insts())
2886 switch (NumPasses) {
2888 NeedWaitStates = SMFMA4x4WritesVGPROverlappedSrcABWaitStates;
2893 NeedWaitStates = SMFMA16x16WritesVGPROverlappedSrcABWaitStates;
2897 NeedWaitStates = SMFMA32x32WritesVGPROverlappedSrcABWaitStates;
2901 if (WaitStatesNeeded >= NeedWaitStates)
2904 WaitStatesNeededForUse = NeedWaitStates - NumWaitStates;
2905 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2907 if (WaitStatesNeeded == MaxWaitStates)
2912 WaitStatesNeeded = std::max(WaitStatesNeeded, checkMFMAPadding(
MI));
2914 return WaitStatesNeeded;
2917int GCNHazardRecognizer::checkMAILdStHazards(
MachineInstr *
MI)
const {
2919 if (!ST.hasMAIInsts() || ST.hasGFX90AInsts())
2922 int WaitStatesNeeded = 0;
2924 auto IsAccVgprReadFn = [](
const MachineInstr &
MI) {
2925 return MI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64;
2928 for (
const MachineOperand &
Op :
MI->explicit_uses()) {
2929 if (!
Op.isReg() || !TRI.isVGPR(MF.getRegInfo(),
Op.getReg()))
2934 const int AccVgprReadLdStWaitStates = 2;
2935 const int VALUWriteAccVgprRdWrLdStDepVALUWaitStates = 1;
2936 const int MaxWaitStates = 2;
2938 int WaitStatesNeededForUse = AccVgprReadLdStWaitStates -
2939 getWaitStatesSinceDef(
Reg, IsAccVgprReadFn, MaxWaitStates);
2940 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2942 if (WaitStatesNeeded == MaxWaitStates)
2943 return WaitStatesNeeded;
2945 auto IsVALUAccVgprRdWrCheckFn = [
Reg,
this](
const MachineInstr &
MI) {
2946 if (
MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64 &&
2947 MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
2949 auto IsVALUFn = [](
const MachineInstr &
MI) {
2952 return getWaitStatesSinceDef(
Reg, IsVALUFn, 2 ) <
2953 std::numeric_limits<int>::max();
2956 WaitStatesNeededForUse = VALUWriteAccVgprRdWrLdStDepVALUWaitStates -
2957 getWaitStatesSince(IsVALUAccVgprRdWrCheckFn, MaxWaitStates);
2958 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2961 return WaitStatesNeeded;
2964int GCNHazardRecognizer::checkPermlaneHazards(
MachineInstr *
MI)
const {
2965 assert(!ST.hasVcmpxPermlaneHazard() &&
2966 "this is a different vcmpx+permlane hazard");
2967 const SIRegisterInfo *TRI = ST.getRegisterInfo();
2968 const SIInstrInfo *TII = ST.getInstrInfo();
2970 auto IsVCmpXWritesExecFn = [TII, TRI](
const MachineInstr &
MI) {
2974 auto IsVALUFn = [](
const MachineInstr &
MI) {
2978 const int VCmpXWritesExecWaitStates = 4;
2979 const int VALUWritesVDstWaitStates = 2;
2980 int WaitStatesNeeded = 0;
2982 for (
const MachineOperand &
Op :
MI->explicit_uses()) {
2983 if (!
Op.isReg() || !TRI->isVGPR(MF.getRegInfo(),
Op.getReg()))
2987 int WaitStatesSinceDef =
2988 VALUWritesVDstWaitStates -
2989 getWaitStatesSinceDef(
Reg, IsVALUFn,
2990 VALUWritesVDstWaitStates);
2991 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesSinceDef);
2992 if (WaitStatesNeeded >= VALUWritesVDstWaitStates)
2996 int VCmpXHazardWaits =
2997 VCmpXWritesExecWaitStates -
2998 getWaitStatesSince(IsVCmpXWritesExecFn, VCmpXWritesExecWaitStates);
3000 WaitStatesNeeded = std::max(WaitStatesNeeded, VCmpXHazardWaits);
3001 return WaitStatesNeeded;
3009 return NumPasses + 2;
3019 return NumPasses + 3 + (NumPasses != 2 && IsGFX950);
3029 return NumPasses + 3 + (NumPasses != 2 && IsGFX950);
3037 return NumPasses + 2;
3040int GCNHazardRecognizer::checkMAIVALUHazards(
MachineInstr *
MI)
const {
3041 if (!ST.hasGFX90AInsts())
3044 auto IsDGEMMFn = [](
const MachineInstr &
MI) ->
bool {
3052 const MachineRegisterInfo &MRI = MF.getRegInfo();
3054 int WaitStatesNeeded = 0;
3060 const MachineInstr *
MFMA =
nullptr;
3062 auto IsMFMAWriteFn = [&
Reg, &
MFMA,
this](
const MachineInstr &
MI) {
3064 !TRI.regsOverlap(
MI.getOperand(0).getReg(),
Reg))
3070 const MachineInstr *
DOT =
nullptr;
3071 auto IsDotWriteFn = [&
Reg, &
DOT,
this](
const MachineInstr &
MI) {
3073 !TRI.regsOverlap(
MI.getOperand(0).getReg(),
Reg))
3079 bool DGEMMAfterVALUWrite =
false;
3080 auto IsDGEMMHazard = [&DGEMMAfterVALUWrite,
this](
const MachineInstr &
MI) {
3083 DGEMMAfterVALUWrite =
true;
3087 if (!TII.isVALU(
MI) || !DGEMMAfterVALUWrite)
3093 int SrcCIdx = AMDGPU::getNamedOperandIdx(
MI->getOpcode(),
3094 AMDGPU::OpName::src2);
3096 if (IsMemOrExport || IsVALU) {
3097 const int SMFMA4x4WriteVgprVALUMemExpReadWaitStates = 5;
3098 const int SMFMA16x16WriteVgprVALUMemExpReadWaitStates = 11;
3099 const int SMFMA32x32WriteVgprVALUMemExpReadWaitStates = 19;
3100 const int DMFMA4x4WriteVgprMemExpReadWaitStates = 9;
3101 const int DMFMA16x16WriteVgprMemExpReadWaitStates = 18;
3102 const int DMFMA4x4WriteVgprVALUReadWaitStates = 6;
3103 const int DMFMA16x16WriteVgprVALUReadWaitStates = 11;
3104 const int GFX950_DMFMA16x16WriteVgprVALUReadWaitStates = 19;
3105 const int DotWriteSameDotReadSrcAB = 3;
3106 const int DotWriteDifferentVALURead = 3;
3107 const int DMFMABetweenVALUWriteVMEMRead = 2;
3108 const int MaxWaitStates = 19;
    for (const MachineOperand &Use : MI->explicit_uses()) {
      if (!Use.isReg())
        continue;
      Reg = Use.getReg();

      DOT = nullptr;
      int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
                                                     MaxWaitStates);
      if (DOT) {
        int NeedWaitStates = 0;
        if (DOT->getOpcode() == MI->getOpcode()) {
          if (&Use - &MI->getOperand(0) != SrcCIdx)
            NeedWaitStates = DotWriteSameDotReadSrcAB;
        } else {
          NeedWaitStates = DotWriteDifferentVALURead;
        }

        int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
        WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
      }

      if (IsMem && ST.hasGFX90AInsts() && !ST.hasGFX940Insts()) {
        DGEMMAfterVALUWrite = false;
        if (TRI.isVectorRegister(MRI, Reg)) {
          int WaitStatesNeededForUse =
              DMFMABetweenVALUWriteVMEMRead -
              getWaitStatesSinceDef(Reg, IsDGEMMHazard,
                                    DMFMABetweenVALUWriteVMEMRead);

          WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
        }
      }

      MFMA = nullptr;
      WaitStatesSinceDef =
          getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates);
      if (!MFMA)
        continue;

      unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
      int NumPasses = HazardDefLatency;
      int NeedWaitStates = MaxWaitStates;

      if (SIInstrInfo::isDGEMM(MFMA->getOpcode())) {
        switch (HazardDefLatency) {
        case 4:
          NeedWaitStates = IsMemOrExport ? DMFMA4x4WriteVgprMemExpReadWaitStates
                                         : DMFMA4x4WriteVgprVALUReadWaitStates;
          break;
        case 8:
        case 16:
          NeedWaitStates =
              IsMemOrExport
                  ? DMFMA16x16WriteVgprMemExpReadWaitStates
                  : (ST.hasGFX950Insts()
                         ? GFX950_DMFMA16x16WriteVgprVALUReadWaitStates
                         : DMFMA16x16WriteVgprVALUReadWaitStates);
          break;
        default:
          llvm_unreachable("unexpected dgemm");
        }
      } else if (ST.hasGFX940Insts()) {
        NeedWaitStates =
            TII.isXDL(*MFMA)
                ? GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(
                      NumPasses, ST.hasGFX950Insts())
                : GFX940_SMFMA_N_PassWriteVgprVALUMemExpReadWaitStates(
                      NumPasses);
      } else {
        switch (HazardDefLatency) {
        case 2:
          NeedWaitStates = SMFMA4x4WriteVgprVALUMemExpReadWaitStates;
          break;
        case 8:
          NeedWaitStates = SMFMA16x16WriteVgprVALUMemExpReadWaitStates;
          break;
        case 16:
          NeedWaitStates = SMFMA32x32WriteVgprVALUMemExpReadWaitStates;
          break;
        default:
          llvm_unreachable("unexpected number of passes");
        }
      }

      int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

      if (WaitStatesNeeded == MaxWaitStates)
        break;
    }
  }
  unsigned Opc = MI->getOpcode();
  const int DMFMAToFMA64WaitStates = 2;
  if ((Opc == AMDGPU::V_FMA_F64_e64 ||
       Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64 ||
       Opc == AMDGPU::V_FMAC_F64_dpp) &&
      WaitStatesNeeded < DMFMAToFMA64WaitStates) {
    int WaitStatesNeededForUse = DMFMAToFMA64WaitStates -
      getWaitStatesSince(IsDGEMMFn, DMFMAToFMA64WaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }

  if (!IsVALU && !IsMemOrExport)
    return WaitStatesNeeded;
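  // The remaining checks cover write-after-write against MFMA/DOT producers
  // and write-after-read against MFMAs that consume the register as src2,
  // using the same scan-and-capture scheme as the read checks above.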
  for (const MachineOperand &Def : MI->defs()) {
    const int SMFMA4x4WriteVgprVALUWawWaitStates = 5;
    const int SMFMA16x16WriteVgprVALUWawWaitStates = 11;
    const int SMFMA32x32WriteVgprVALUWawWaitStates = 19;
    const int SMFMA4x4ReadVgprVALUWarWaitStates = 1;
    const int GFX940_XDL4PassReadVgprVALUWarWaitStates = 3;
    const int SMFMA16x16ReadVgprVALUWarWaitStates = 7;
    const int SMFMA32x32ReadVgprVALUWarWaitStates = 15;
    const int DMFMA4x4WriteVgprVALUWriteWaitStates = 6;
    const int DMFMA16x16WriteVgprVALUWriteWaitStates = 11;
    const int DotWriteDifferentVALUWrite = 3;
    const int MaxWaitStates = 19;
    const int MaxWarWaitStates = 15;
    Reg = Def.getReg();

    DOT = nullptr;
    int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
                                                   MaxWaitStates);
    if (DOT && DOT->getOpcode() != MI->getOpcode())
      WaitStatesNeeded = std::max(WaitStatesNeeded, DotWriteDifferentVALUWrite -
                                                    WaitStatesSinceDef);

    MFMA = nullptr;
    WaitStatesSinceDef =
        getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates);
    if (MFMA) {
      int NeedWaitStates = MaxWaitStates;
      int NumPasses = TSchedModel.computeInstrLatency(MFMA);

      if (SIInstrInfo::isDGEMM(MFMA->getOpcode())) {
        switch (NumPasses) {
        case 4:
          NeedWaitStates = DMFMA4x4WriteVgprVALUWriteWaitStates;
          break;
        case 8:
        case 16:
          NeedWaitStates = DMFMA16x16WriteVgprVALUWriteWaitStates;
          break;
        default:
          llvm_unreachable("unexpected number of cycles for dgemm");
        }
      } else if (ST.hasGFX940Insts()) {
        NeedWaitStates =
            TII.isXDL(*MFMA)
                ? GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(
                      NumPasses, ST.hasGFX950Insts())
                : GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates(NumPasses);
      } else {
        switch (NumPasses) {
        case 2:
          NeedWaitStates = SMFMA4x4WriteVgprVALUWawWaitStates;
          break;
        case 8:
          NeedWaitStates = SMFMA16x16WriteVgprVALUWawWaitStates;
          break;
        case 16:
          NeedWaitStates = SMFMA32x32WriteVgprVALUWawWaitStates;
          break;
        default:
          llvm_unreachable("Unexpected number of passes");
        }
      }

      int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

      if (WaitStatesNeeded == MaxWaitStates)
        break;
    }
    auto IsSMFMAReadAsCFn = [&Reg, &MFMA, this](const MachineInstr &MI) {
      if (!SIInstrInfo::isMFMA(MI) || SIInstrInfo::isDGEMM(MI.getOpcode()) ||
          !MI.readsRegister(Reg, &TRI))
        return false;

      if (ST.hasGFX940Insts() && !TII.isXDL(MI))
        return false;

      const MachineOperand *SrcC =
          TII.getNamedOperand(MI, AMDGPU::OpName::src2);
      assert(SrcC);
      if (!SrcC->isReg() || !TRI.regsOverlap(SrcC->getReg(), Reg))
        return false;

      MFMA = &MI;
      return true;
    };

    MFMA = nullptr;
    int WaitStatesSinceUse = getWaitStatesSince(IsSMFMAReadAsCFn,
                                                MaxWarWaitStates);
    if (!MFMA)
      continue;

    unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
    int NeedWaitStates = MaxWaitStates;
    switch (HazardDefLatency) {
    case 2:  NeedWaitStates = SMFMA4x4ReadVgprVALUWarWaitStates;
             break;
    case 4:  assert(ST.hasGFX940Insts());
             NeedWaitStates = GFX940_XDL4PassReadVgprVALUWarWaitStates;
             break;
    case 8:  NeedWaitStates = SMFMA16x16ReadVgprVALUWarWaitStates;
             break;
    case 16: [[fallthrough]];
    default: NeedWaitStates = SMFMA32x32ReadVgprVALUWarWaitStates;
             break;
    }

    int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceUse;
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }

  return WaitStatesNeeded;
}
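// ShouldPreferAnother may be invoked when getHazardType() returns NoHazard;
// it steers the scheduler away from issuing an MFMA while a previous MFMA is
// still inside its latency window.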
bool GCNHazardRecognizer::ShouldPreferAnother(SUnit *SU) const {
  if (!SU->isInstr())
    return false;

  const MachineInstr *MAI = nullptr;

  auto IsMFMAFn = [&MAI](const MachineInstr &MI) {
    MAI = nullptr;
    if (SIInstrInfo::isMFMA(MI))
      MAI = &MI;
    return MAI != nullptr;
  };

  MachineInstr *MI = SU->getInstr();
  if (IsMFMAFn(*MI)) {
    int W = getWaitStatesSince(IsMFMAFn, 16);
    if (MAI)
      return W < (int)TSchedModel.computeInstrLatency(MAI);
  }

  return false;
}
static void updateGetPCBundle(MachineInstr *NewMI) {
  if (!NewMI->isBundled())
    return;

  // Find the start of the bundle.
  auto I = NewMI->getIterator();
  while (I->isBundledWithPred())
    I--;
  if (I->isBundle())
    I++;

  // Bail if this is not an S_GETPC bundle.
  if (I->getOpcode() != AMDGPU::S_GETPC_B64)
    return;

  // Update offsets of any references in the bundle.
  const unsigned NewBytes = 4;
  assert(NewMI->getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
         "Unexpected instruction insertion in bundle");
  auto NextMI = std::next(NewMI->getIterator());
  auto End = NewMI->getParent()->end();
  while (NextMI != End && NextMI->isBundledWithPred()) {
    for (auto &Operand : NextMI->operands()) {
      if (Operand.isGlobal())
        Operand.setOffset(Operand.getOffset() + NewBytes);
    }
    ++NextMI;
  }
}
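// The hazard handled below (inferred from the code that follows): a VALU
// consumes an SGPR pair or VCC as a carry/condition mask, and a later SALU
// or VALU write to that mask needs an s_waitcnt_depctr before dependent
// reads; the state tracks which SGPRs still carry the hazard.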
bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) {
  if (!ST.hasVALUMaskWriteHazard())
    return false;
  assert(!ST.hasExtendedWaitCounts());

  // This hazard only affects VALUs which read SGPRs and SALUs/VALUs which
  // write them.
  const bool IsSALU = SIInstrInfo::isSALU(*MI);
  const bool IsVALU = SIInstrInfo::isVALU(*MI);
  if (!IsSALU && !IsVALU)
    return false;

  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const MachineRegisterInfo &MRI = MF.getRegInfo();

  auto IgnoreableSGPR = [](const Register Reg) {
    switch (Reg) {
    case AMDGPU::EXEC:
    case AMDGPU::EXEC_LO:
    case AMDGPU::EXEC_HI:
    case AMDGPU::M0:
    case AMDGPU::SGPR_NULL:
    case AMDGPU::SGPR_NULL64:
      return true;
    default:
      return false;
    }
  };

  auto IsVCC = [](const Register Reg) {
    return Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO ||
           Reg == AMDGPU::VCC_HI;
  };
  struct StateType {
    SmallSet<Register, 2> HazardSGPRs;

    static unsigned getHashValue(const StateType &State) {
      return hash_combine_range(State.HazardSGPRs.begin(),
                                State.HazardSGPRs.end());
    }
    static bool isEqual(const StateType &LHS, const StateType &RHS) {
      return LHS.HazardSGPRs == RHS.HazardSGPRs;
    }
  };

  SmallVector<const MachineInstr *> WaitInstrs;
  bool HasSGPRRead = false;
  StateType InitialState;

  // Only waits whose other depctr fields are already at "no wait" can be
  // merged below. The exact definition is assumed; the listing omits it.
  const unsigned ConstantMaskBits = AMDGPU::DepCtr::encodeFieldSaSdst(
      AMDGPU::DepCtr::encodeFieldVaSdst(
          AMDGPU::DepCtr::encodeFieldVaVcc(0xFFFF, 0), 0),
      0);
  // Look for a write to an SGPR that could be consumed as a mask. (Parts of
  // this scan fall in lines missing from the listing; the classification of
  // reads versus the hazard def is reconstructed.)
  MachineOperand *HazardDef = nullptr;
  for (MachineOperand &Op : MI->operands()) {
    if (!Op.isReg())
      continue;
    if (Op.isDef() && HazardDef)
      return false;

    Register Reg = Op.getReg();
    if (IgnoreableSGPR(Reg))
      continue;
    if (!IsVCC(Reg)) {
      if (Op.isImplicit())
        continue;
      if (!TRI->isSGPRReg(MRI, Reg))
        continue;
    }

    if (Op.isDef())
      HazardDef = &Op;
    else
      HasSGPRRead = true;
  }

  if (!HazardDef)
    return false;

  const Register HazardReg = HazardDef->getReg();
  if (AMDGPU::SReg_32RegClass.contains(HazardReg)) {
    InitialState.HazardSGPRs.insert(HazardReg);
  } else {
    InitialState.HazardSGPRs.insert(TRI->getSubReg(HazardReg, AMDGPU::sub0));
    InitialState.HazardSGPRs.insert(TRI->getSubReg(HazardReg, AMDGPU::sub1));
  }
  auto IsHazardFn = [&](StateType &State, const MachineInstr &I) {
    if (State.HazardSGPRs.empty())
      return HazardExpired;

    switch (I.getOpcode()) {
    case AMDGPU::V_ADDC_U32_e32:
    case AMDGPU::V_ADDC_U32_dpp:
    case AMDGPU::V_CNDMASK_B16_t16_e32:
    case AMDGPU::V_CNDMASK_B16_fake16_e32:
    case AMDGPU::V_CNDMASK_B16_t16_dpp:
    case AMDGPU::V_CNDMASK_B16_fake16_dpp:
    case AMDGPU::V_CNDMASK_B32_e32:
    case AMDGPU::V_CNDMASK_B32_dpp:
    case AMDGPU::V_DIV_FMAS_F32_e64:
    case AMDGPU::V_DIV_FMAS_F64_e64:
    case AMDGPU::V_SUBB_U32_e32:
    case AMDGPU::V_SUBB_U32_dpp:
    case AMDGPU::V_SUBBREV_U32_e32:
    case AMDGPU::V_SUBBREV_U32_dpp: {
      // These encodings read VCC implicitly as the mask source. (Body
      // reconstructed around missing lines.)
      return IsVCC(HazardReg) ? HazardFound : NoHazardFound;
    }
    case AMDGPU::V_ADDC_U32_e64:
    case AMDGPU::V_ADDC_U32_e64_dpp:
    case AMDGPU::V_CNDMASK_B16_t16_e64:
    case AMDGPU::V_CNDMASK_B16_fake16_e64:
    case AMDGPU::V_CNDMASK_B16_t16_e64_dpp:
    case AMDGPU::V_CNDMASK_B16_fake16_e64_dpp:
    case AMDGPU::V_CNDMASK_B32_e64:
    case AMDGPU::V_CNDMASK_B32_e64_dpp:
    case AMDGPU::V_SUBB_U32_e64:
    case AMDGPU::V_SUBB_U32_e64_dpp:
    case AMDGPU::V_SUBBREV_U32_e64:
    case AMDGPU::V_SUBBREV_U32_e64_dpp: {
      // The e64 encodings name the mask explicitly in src2.
      const MachineOperand *SSRCOp =
          TII.getNamedOperand(I, AMDGPU::OpName::src2);
      assert(SSRCOp);
      bool Result = TRI->regsOverlap(SSRCOp->getReg(), HazardReg);
      return Result ? HazardFound : NoHazardFound;
    }
    default:
      return NoHazardFound;
    }
  };
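  // The case split above mirrors the two encodings of the mask consumers:
  // e32/dpp forms read VCC implicitly, while e64 forms name the mask in
  // src2, so only the latter need the explicit operand-overlap test.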
  auto UpdateStateFn = [&](StateType &State, const MachineInstr &I) {
    switch (I.getOpcode()) {
    case AMDGPU::S_WAITCNT_DEPCTR:
      // Record waits in the same block as MI that could be merged into the
      // wait this fix will insert.
      if (!HasSGPRRead && I.getParent() == MI->getParent() && !I.isBundled() &&
          (I.getOperand(0).getImm() & ConstantMaskBits) == ConstantMaskBits)
        WaitInstrs.push_back(&I);
      break;
    default:
      // Update tracking of SGPR reads and writes.
      for (auto &Op : I.operands()) {
        if (!Op.isReg())
          continue;

        Register Reg = Op.getReg();
        if (IgnoreableSGPR(Reg))
          continue;
        if (!IsVCC(Reg)) {
          if (Op.isImplicit())
            continue;
          if (!TRI->isSGPRReg(MRI, Reg))
            continue;
        }
        if (Op.isUse()) {
          HasSGPRRead = true;
          continue;
        }

        // Stop tracking any SGPRs with writes. (Erase loop reconstructed.)
        SmallVector<Register, 2> Overwritten;
        for (Register SGPR : State.HazardSGPRs) {
          if (Reg == SGPR || TRI->regsOverlap(Reg, SGPR))
            Overwritten.push_back(SGPR);
        }
        for (Register SGPR : Overwritten)
          State.HazardSGPRs.erase(SGPR);
      }
      break;
    }
  };
  if (!hasHazard<StateType>(InitialState, IsHazardFn, UpdateStateFn,
                            MI->getParent(),
                            std::next(MI->getReverseIterator())))
    return false;

  // Pick the depctr field to wait on based on the kind of writer. (Field
  // selection reconstructed around missing lines.)
  unsigned DepCtr =
      IsVALU ? (IsVCC(HazardReg) ? AMDGPU::DepCtr::encodeFieldVaVcc(0)
                                 : AMDGPU::DepCtr::encodeFieldVaSdst(0))
             : AMDGPU::DepCtr::encodeFieldSaSdst(0);

  if (!WaitInstrs.empty()) {
    // Fold the recorded waits into the new one, then delete them.
    unsigned Found = 0;
    SmallVector<MachineInstr *> ToErase;
    for (MachineBasicBlock::reverse_iterator It = MI->getReverseIterator(),
                                             End = MI->getParent()->rend();
         Found < WaitInstrs.size() && It != End; ++It) {
      MachineInstr *WaitMI = &*It;
      // Find the next recorded wait instruction.
      if (std::as_const(WaitMI) != WaitInstrs[Found])
        continue;
      Found++;
      unsigned WaitMask = WaitMI->getOperand(0).getImm();
      assert((WaitMask & ConstantMaskBits) == ConstantMaskBits);
      DepCtr = AMDGPU::DepCtr::encodeFieldSaSdst(
          DepCtr, std::min(AMDGPU::DepCtr::decodeFieldSaSdst(WaitMask),
                           AMDGPU::DepCtr::decodeFieldSaSdst(DepCtr)));
      DepCtr = AMDGPU::DepCtr::encodeFieldVaSdst(
          DepCtr, std::min(AMDGPU::DepCtr::decodeFieldVaSdst(WaitMask),
                           AMDGPU::DepCtr::decodeFieldVaSdst(DepCtr)));
      DepCtr = AMDGPU::DepCtr::encodeFieldVaVcc(
          DepCtr, std::min(AMDGPU::DepCtr::decodeFieldVaVcc(WaitMask),
                           AMDGPU::DepCtr::decodeFieldVaVcc(DepCtr)));
      ToErase.push_back(WaitMI);
    }
    for (MachineInstr *WaitMI : ToErase)
      WaitMI->eraseFromParent();
  }
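  // The merge keeps the strictest wait per field: a depctr field of 0 means
  // "wait for all outstanding", so std::min over the decoded values of the
  // absorbed waits can only strengthen the wait that replaces them.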
  // Add s_waitcnt_depctr after the SGPR write.
  auto NextMI = std::next(MI->getIterator());
  auto NewMI = BuildMI(*MI->getParent(), NextMI, MI->getDebugLoc(),
                       TII.get(AMDGPU::S_WAITCNT_DEPCTR))
                   .addImm(DepCtr);

  // The SGPR write may be an s_getpc in a bundle; keep bundled offsets valid.
  updateGetPCBundle(NewMI);

  return true;
}
static bool ensureEntrySetPrio(MachineFunction *MF, int Priority,
                               const SIInstrInfo &TII) {
  MachineBasicBlock &EntryMBB = MF->front();
  if (EntryMBB.begin() != EntryMBB.end()) {
    auto &EntryMI = *EntryMBB.begin();
    if (EntryMI.getOpcode() == AMDGPU::S_SETPRIO &&
        EntryMI.getOperand(0).getImm() >= Priority)
      return false;
  }

  BuildMI(EntryMBB, EntryMBB.begin(), DebugLoc(), TII.get(AMDGPU::S_SETPRIO))
      .addImm(Priority);
  return true;
}
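// fixRequiredExportPriority (below) raises shader priority at entry via
// ensureEntrySetPrio and wraps the last export in a sequence that drops to
// priority 0, waits, and restores normal priority.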
bool GCNHazardRecognizer::fixRequiredExportPriority(MachineInstr *MI) {
  if (!ST.hasRequiredExportPriority())
    return false;

  // Assume the following shader types will never have exports,
  // and avoid adding or adjusting S_SETPRIO.
  MachineBasicBlock *MBB = MI->getParent();
  MachineFunction *MF = MBB->getParent();
  auto CC = MF->getFunction().getCallingConv();
  switch (CC) {
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_CS_Chain:
  case CallingConv::AMDGPU_CS_ChainPreserve:
  case CallingConv::AMDGPU_KERNEL:
    return false;
  default:
    break;
  }

  const int MaxPriority = 3;
  const int NormalPriority = 2;
  const int PostExportPriority = 0;

  auto It = MI->getIterator();
  switch (MI->getOpcode()) {
  case AMDGPU::S_ENDPGM:
  case AMDGPU::S_ENDPGM_SAVED:
  case AMDGPU::S_ENDPGM_ORDERED_PS_DONE:
  case AMDGPU::SI_RETURN_TO_EPILOG:
    // Ensure a shader with calls raises priority at entry, so that the
    // priority is correct if exports exist in a callee.
    if (MF->getFrameInfo().hasCalls())
      return ensureEntrySetPrio(MF, NormalPriority, TII);
    return false;
  case AMDGPU::S_SETPRIO: {
    // Raise the minimum priority unless we are inside the workaround.
    auto &PrioOp = MI->getOperand(0);
    int Prio = PrioOp.getImm();
    bool InWA = (Prio == PostExportPriority) &&
                (It != MBB->begin() && TII.isEXP(*std::prev(It)));
    if (InWA || Prio >= NormalPriority)
      return false;
    PrioOp.setImm(std::min(Prio + NormalPriority, MaxPriority));
    return true;
  }
  default:
    if (!TII.isEXP(*MI))
      return false;
    break;
  }

  // Check entry priority at each export (as there will only be a few).
  // Note: amdgpu_gfx can only be a callee, so defer to the caller's setprio.
  bool Changed = false;
  if (CC != CallingConv::AMDGPU_Gfx)
    Changed = ensureEntrySetPrio(MF, NormalPriority, TII);

  auto NextMI = std::next(It);
  bool EndOfShader = false;
  if (NextMI != MBB->end()) {
    // Only need the workaround at the end of a sequence of exports.
    if (TII.isEXP(*NextMI))
      return Changed;
    // An appropriate S_SETPRIO after the export means the workaround was
    // already applied.
    if (NextMI->getOpcode() == AMDGPU::S_SETPRIO &&
        NextMI->getOperand(0).getImm() == PostExportPriority)
      return Changed;
    EndOfShader = NextMI->getOpcode() == AMDGPU::S_ENDPGM;
  }

  const DebugLoc &DL = MI->getDebugLoc();

  // Lower priority.
  BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_SETPRIO))
      .addImm(PostExportPriority);

  if (!EndOfShader) {
    // Wait for exports to complete.
    BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_WAITCNT_EXPCNT))
        .addReg(AMDGPU::SGPR_NULL)
        .addImm(0);
  }

  BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_NOP)).addImm(0);
  BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_NOP)).addImm(0);

  if (!EndOfShader) {
    // Return to normal (higher) priority.
    BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_SETPRIO))
        .addImm(NormalPriority);
  }

  return true;
}
bool GCNHazardRecognizer::fixDsAtomicAsyncBarrierArriveB64(MachineInstr *MI) {
  if (MI->getOpcode() != AMDGPU::DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64)
    return false;

  const SIInstrInfo *TII = ST.getInstrInfo();
  // Bracket the barrier-arrive with depctr waits before and after. The wait
  // masks below are assumed; the listing does not preserve the immediate
  // operands.
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_WAITCNT_DEPCTR))
      .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));
  BuildMI(*MI->getParent(), std::next(MI->getIterator()), MI->getDebugLoc(),
          TII->get(AMDGPU::S_WAITCNT_DEPCTR))
      .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));
  return true;
}
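// Unlike the pattern-searching fixes above, this workaround is
// unconditional: every DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64 is bracketed by
// waits, with no backward scan for a triggering instruction.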
bool GCNHazardRecognizer::fixScratchBaseForwardingHazard(MachineInstr *MI) {
  if (!IsHazardRecognizerMode)
    return false;

  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();

  const int FlatScrBaseWaitStates = 10;

  bool ReadsFlatScrLo =
      MI->readsRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO, TRI);
  bool ReadsFlatScrHi =
      MI->readsRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, TRI);
  // The guards around the next two assignments are assumed: reads of the
  // combined 64-bit base are treated as reads of both halves.
  if (MI->readsRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE, TRI)) {
    ReadsFlatScrLo = true;
    ReadsFlatScrHi = true;
  }
  if (!ReadsFlatScrLo && !ReadsFlatScrHi)
    return false;

  const MachineRegisterInfo &MRI = MF.getRegInfo();

  auto IsRegDefHazard = [&](MCRegister Reg) {
    DenseSet<const MachineBasicBlock *> Visited;
    auto IsHazardFn = [TRI, Reg](const MachineInstr &MI) {
      return MI.modifiesRegister(Reg, TRI);
    };

    // Only SALU/VALU instructions that define an SGPR count as wait states
    // for this hazard.
    auto IsSGPRDef = [TII, TRI, &MRI](const MachineInstr &MI) -> unsigned {
      if (!TII->isSALU(MI) && !TII->isVALU(MI))
        return 0;
      for (const MachineOperand &MO : MI.all_defs()) {
        if (TRI->isSGPRReg(MRI, MO.getReg()))
          return 1;
      }
      return 0;
    };

    auto IsExpiredFn = [=](const MachineInstr &MI, int SgprWrites) {
      if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR) {
        unsigned Wait = MI.getOperand(0).getImm();
        // Field test assumed: an existing wait on SGPR forwarding resolves
        // the hazard.
        if (AMDGPU::DepCtr::decodeFieldSaSdst(Wait) == 0 &&
            AMDGPU::DepCtr::decodeFieldVaSdst(Wait) == 0)
          return true;
      }
      return SgprWrites >= FlatScrBaseWaitStates;
    };

    return ::getWaitStatesSince(
               IsHazardFn, MI->getParent(),
               std::next(MI->getReverseIterator()),
               0, IsExpiredFn, Visited, IsSGPRDef) < FlatScrBaseWaitStates;
  };

  // SGPR102/SGPR103 shadow the flat scratch base: no hazard if the half
  // being read is constant or was not written recently.
  if ((!ReadsFlatScrLo || MRI.isConstantPhysReg(AMDGPU::SGPR102) ||
       !IsRegDefHazard(AMDGPU::SGPR102)) &&
      (!ReadsFlatScrHi || MRI.isConstantPhysReg(AMDGPU::SGPR103) ||
       !IsRegDefHazard(AMDGPU::SGPR103)))
    return false;

  // Wait mask assumed; the immediate operand is not preserved in the
  // listing.
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_WAITCNT_DEPCTR))
      .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));
  return true;
}
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::V_NOP_e32));
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::V_NOP_e32));